From 2ae1d698788eb8f484ae21f646b29a1439d58905 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 4 Jun 2024 13:38:11 +0100 Subject: [PATCH 1/6] [VPlan] Delay adding canonical IV increment and exit branches. This patch delays adding canonical IV increments, computing the next active lane mask and the branch recipes to exit the vector loop. During initial construction of a VPlan, only add the canonical IV phi and active-lane-mask phis if needed. Similarly, do not add the branches to exit the loop initially. Computing the next IV value, the next active-lane-mask or the exit branches are details that are only needed to simplify codegen (execute of the individual recipes). This makes the inital VPlans more abstract (and simpler), in that we initially leave out some details. Initial VPlans still have the canonical induction recipe, which provides the value of the canonical induction at every iteration, but we leave out the detail of how it is computed, which is not needed until code generation. Similar reasoning applies to active lane mask increments and branch recipes. The vector loop region initially abstractly models the loop processing all vector iterations, without specifying how exactly exiting the region is handled. Again these details are only needed for code generation. To introduce the missing pieces and complete the abstract VPlan, a new prepareToExecute transform is added and run just before exiting the VPlan. This is an attempt to further employ gradual lowering, as outlined in https://llvm.org/devmtg/2023-10/slides/techtalks/Hahn-VPlan-StatusUpdateAndRoadmap.pdf and already applied for replicate region handling. --- .../Transforms/Vectorize/LoopVectorize.cpp | 37 +++--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 + llvm/lib/Transforms/Vectorize/VPlan.h | 3 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 125 +++++++++++------- .../Transforms/Vectorize/VPlanTransforms.h | 6 + .../Transforms/Vectorize/VPlanVerifier.cpp | 35 ++--- llvm/lib/Transforms/Vectorize/VPlanVerifier.h | 5 +- .../AArch64/scalable-strict-fadd.ll | 8 +- .../sve-interleaved-masked-accesses.ll | 6 +- .../AArch64/sve-tail-folding-forced.ll | 2 +- .../sve-tail-folding-overflow-checks.ll | 2 +- .../AArch64/sve-tail-folding-reductions.ll | 6 +- .../AArch64/sve-tail-folding-unroll.ll | 38 +++++- .../LoopVectorize/AArch64/sve-tail-folding.ll | 20 +-- .../LoopVectorize/AArch64/sve-widen-gep.ll | 2 +- .../AArch64/synthesize-mask-for-call.ll | 24 ---- .../AArch64/tail-folding-styles.ll | 2 +- .../AArch64/uniform-args-call-variants.ll | 22 +-- .../widen-call-with-intrinsic-or-libfunc.ll | 9 -- .../RISCV/riscv-vector-reverse.ll | 8 -- ...-order-recurrence-sink-replicate-region.ll | 12 +- .../Transforms/LoopVectorize/icmp-uniforms.ll | 2 +- .../LoopVectorize/vplan-dot-printing.ll | 2 +- .../LoopVectorize/vplan-iv-transforms.ll | 2 +- .../LoopVectorize/vplan-printing.ll | 30 ++--- .../vplan-sink-scalars-and-merge-vf1.ll | 2 +- .../vplan-sink-scalars-and-merge.ll | 28 ++-- .../vplan-unused-interleave-group.ll | 2 +- 28 files changed, 237 insertions(+), 206 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 37b8023e1fcf2..3664fd44f8652 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7428,6 +7428,14 @@ LoopVectorizationPlanner::executePlan( "expanded SCEVs to reuse can only be used during epilogue vectorization"); (void)IsEpilogueVectorization; + TailFoldingStyle Style = + CM.getTailFoldingStyle(!isIndvarOverflowCheckKnownFalse(&CM, BestVF)); + // When not folding the tail, we know that the induction increment will not + // overflow. + bool HasNUW = Style == TailFoldingStyle::None; + bool WithoutRuntimeCheck = + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + VPlanTransforms::finalizePlan(BestVPlan, HasNUW, WithoutRuntimeCheck); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF @@ -8467,17 +8475,16 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) break; - assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); + assert(verifyVPlanIsValid(*Plan, /*IsAbstract*/ true) && + "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } VF = SubRange.End; } } -// Add the necessary canonical IV and branch recipes required to control the -// loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, - DebugLoc DL) { +// Add the required canonical IV. +static void addCanonicalIV(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddLiveIn(StartIdx); @@ -8486,17 +8493,6 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); - - VPBuilder Builder(TopRegion->getExitingBasicBlock()); - // Add a VPInstruction to increment the scalar canonical IV by VF * UF. - auto *CanonicalIVIncrement = Builder.createOverflowingOp( - Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, - "index.next"); - CanonicalIVPHI->addOperand(CanonicalIVIncrement); - - // Add the BranchOnCount VPInstruction to the latch. - Builder.createNaryOp(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8555,7 +8551,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // When not folding the tail, we know that the induction increment will not // overflow. bool HasNUW = Style == TailFoldingStyle::None; - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + addCanonicalIV(*Plan, Legal->getWidestInductionType(), HasNUW, DL); VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); @@ -8806,9 +8802,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // Tail folding is not supported for outer loops, so the induction increment // is guaranteed to not wrap. bool HasNUW = true; - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, - DebugLoc()); - assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); + addCanonicalIV(*Plan, Legal->getWidestInductionType(), HasNUW, DebugLoc()); + assert(verifyVPlanIsValid(*Plan, /*IsAbstract*/ true) && "VPlan is invalid"); return Plan; } @@ -8991,7 +8986,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( PreviousLink = RedRecipe; } } - Builder.setInsertPoint(&*LatchVPBB->begin()); + Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast(&R); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f17be451e6846..8e4df6c3caa92 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -20,6 +20,7 @@ #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "VPlanPatternMatch.h" +#include "VPlanVerifier.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -841,6 +842,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { + assert(verifyVPlanIsValid(*this, /*IsAbstract*/ false) && "VPlan is invalid"); + // Initialize CFG state. State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5bb88e4a57dc3..ebe432e13875c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2580,7 +2580,8 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { VPCanonicalIVPHIRecipe *clone() override { auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc()); - R->addOperand(getBackedgeValue()); + if (getNumOperands() == 2) + R->addOperand(getBackedgeValue()); return R; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8ec67eb2f54bd..27ff80b06a37d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1235,16 +1235,10 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); - auto *CanonicalIVIncrement = - cast(CanonicalIVPHI->getBackedgeValue()); - // TODO: Check if dropping the flags is needed if - // !DataAndControlFlowWithoutRuntimeCheck. - CanonicalIVIncrement->dropPoisonGeneratingFlags(); - DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); + DebugLoc DL = CanonicalIVPHI->getDebugLoc(); // We can't use StartV directly in the ActiveLaneMask VPInstruction, since // we have to take unrolling into account. Each part needs to start at // Part * VF @@ -1254,21 +1248,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // Create the ActiveLaneMask instruction using the correct start values. VPValue *TC = Plan.getTripCount(); - VPValue *TripCount, *IncrementValue; - if (!DataAndControlFlowWithoutRuntimeCheck) { - // When the loop is guarded by a runtime overflow check for the loop - // induction variable increment by VF, we can increment the value before - // the get.active.lane mask and use the unmodified tripcount. - IncrementValue = CanonicalIVIncrement; - TripCount = TC; - } else { - // When avoiding a runtime check, the active.lane.mask inside the loop - // uses a modified trip count and the induction variable increment is - // done after the active.lane.mask intrinsic is called. - IncrementValue = CanonicalIVPHI; - TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, - {TC}, DL); - } auto *EntryIncrement = Builder.createOverflowingOp( VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL, "index.part.next"); @@ -1282,24 +1261,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // preheader ActiveLaneMask instruction. auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); LaneMaskPhi->insertAfter(CanonicalIVPHI); - - // Create the active lane mask for the next iteration of the loop before the - // original terminator. - VPRecipeBase *OriginalTerminator = EB->getTerminator(); - Builder.setInsertPoint(OriginalTerminator); - auto *InLoopIncrement = - Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, {false, false}, DL); - auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, - {InLoopIncrement, TripCount}, DL, - "active.lane.mask.next"); - LaneMaskPhi->addOperand(ALM); - - // Replace the original terminator with BranchOnCond. We have to invert the - // mask here because a true condition means jumping to the exit block. - auto *NotMask = Builder.createNot(ALM, DL); - Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); - OriginalTerminator->eraseFromParent(); return LaneMaskPhi; } @@ -1418,6 +1379,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { return false; auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); + VPBasicBlock *Latch = Plan.getVectorLoopRegion()->getExitingBasicBlock(); // Create the ExplicitVectorLengthPhi recipe in the main loop. auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); @@ -1426,22 +1388,18 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { {EVLPhi, Plan.getTripCount()}); VPEVL->insertBefore(*Header, Header->getFirstNonPhi()); - auto *CanonicalIVIncrement = - cast(CanonicalIVPHI->getBackedgeValue()); VPSingleDefRecipe *OpVPEVL = VPEVL; if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); IVSize != 32) { OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, CanonicalIVPHI->getScalarType()); - OpVPEVL->insertBefore(CanonicalIVIncrement); + Latch->appendRecipe(OpVPEVL); } auto *NextEVLIV = - new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, - {CanonicalIVIncrement->hasNoUnsignedWrap(), - CanonicalIVIncrement->hasNoSignedWrap()}, - CanonicalIVIncrement->getDebugLoc(), "index.evl.next"); - NextEVLIV->insertBefore(CanonicalIVIncrement); + new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, {false, false}, + CanonicalIVPHI->getDebugLoc(), "index.evl.next"); + Latch->appendRecipe(NextEVLIV); EVLPhi->addOperand(NextEVLIV); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { @@ -1468,9 +1426,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { recursivelyDeleteDeadRecipes(HeaderMask); } // Replace all uses of VPCanonicalIVPHIRecipe by - // VPEVLBasedIVPHIRecipe except for the canonical IV increment. + // VPEVLBasedIVPHIRecipe. CanonicalIVPHI->replaceAllUsesWith(EVLPhi); - CanonicalIVIncrement->setOperand(0, CanonicalIVPHI); // TODO: support unroll factor > 1. Plan.setUF(1); return true; @@ -1572,3 +1529,71 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } } } + +void VPlanTransforms::finalizePlan(VPlan &Plan, bool HasNUW, + bool DataAndControlFlowWithoutRuntimeCheck) { + auto *CanIV = Plan.getCanonicalIV(); + + VPBasicBlock *EB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPBuilder Builder(EB); + DebugLoc DL = CanIV->getDebugLoc(); + // Add a VPInstruction to increment the scalar canonical IV by VF * UF. + auto *CanonicalIVIncrement = + Builder.createOverflowingOp(Instruction::Add, {CanIV, &Plan.getVFxUF()}, + {HasNUW, false}, DL, "index.next"); + + CanIV->addOperand(CanonicalIVIncrement); + + auto FoundLaneMaskPhi = find_if( + Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), + [](VPRecipeBase &P) { return isa(P); }); + + if (FoundLaneMaskPhi == + Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis().end()) { + // Add the BranchOnCount VPInstruction to the latch. + Builder.createNaryOp(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, + DL); + return; + } + auto *LaneMaskPhi = cast(&*FoundLaneMaskPhi); + auto *VecPreheader = + cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); + Builder.setInsertPoint(VecPreheader); + + VPValue *TC = Plan.getTripCount(); + + // TODO: Check if dropping the flags is needed if + // !DataAndControlFlowWithoutRuntimeCheck. + CanonicalIVIncrement->dropPoisonGeneratingFlags(); + VPValue *TripCount, *IncrementValue; + if (!DataAndControlFlowWithoutRuntimeCheck) { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. + IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } else { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + IncrementValue = CanIV; + TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, + {TC}, DL); + } + // Create the active lane mask for the next iteration of the loop before the + // original terminator. + Builder.setInsertPoint(EB); + auto *InLoopIncrement = + Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, + {IncrementValue}, {false, false}, DL); + auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {InLoopIncrement, TripCount}, DL, + "active.lane.mask.next"); + LaneMaskPhi->addOperand(ALM); + + // Replace the original terminator with BranchOnCond. We have to invert the + // mask here because a true condition means jumping to the exit block. + auto *NotMask = Builder.createNot(ALM, DL); + Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 96b8a6639723c..88b7317a00262 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -106,6 +106,12 @@ struct VPlanTransforms { /// this transformation. /// \returns true if the transformation succeeds, or false if it doesn't. static bool tryAddExplicitVectorLength(VPlan &Plan); + + /// Finalize \p Plan by introducing recipes needed for code-gen, like + /// introducing explicit increments for the canonical induction and the branch + /// to exit the vector loop. + static void finalizePlan(VPlan &Plan, bool HasNUW, + bool DataAndControlFlowWithoutRuntimeCheck); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 56c82b13ccdef..fc32caed1e9c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -30,6 +30,8 @@ class VPlanVerifier { SmallPtrSet WrappedIRBBs; + bool IsAbstract; + // Verify that phi-like recipes are at the beginning of \p VPBB, with no // other recipes in between. Also check that only header blocks contain // VPHeaderPHIRecipes. @@ -54,7 +56,8 @@ class VPlanVerifier { bool verifyRegionRec(const VPRegionBlock *Region); public: - VPlanVerifier(VPDominatorTree &VPDT) : VPDT(VPDT) {} + VPlanVerifier(VPDominatorTree &VPDT, bool IsAbstract) + : VPDT(VPDT), IsAbstract(IsAbstract) {} bool verify(const VPlan &Plan); }; @@ -185,7 +188,7 @@ bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) { if (VPB->getNumSuccessors() > 1 || (VPBB && VPBB->getParent() && VPBB->isExiting() && !VPBB->getParent()->isReplicator())) { - if (!VPBB || !VPBB->getTerminator()) { + if (!IsAbstract && (!VPBB || !VPBB->getTerminator())) { errs() << "Block has multiple successors but doesn't " "have a proper branch recipe!\n"; return false; @@ -315,18 +318,20 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return false; } - if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount or " - "BranchOnCond VPInstruction but is empty\n"; - return false; - } + if (!IsAbstract) { + if (Exiting->empty()) { + errs() << "VPlan vector loop exiting block must end with BranchOnCount or" + "BranchOnCond VPInstruction but is empty\n"; + return false; + } - auto *LastInst = dyn_cast(std::prev(Exiting->end())); - if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && - LastInst->getOpcode() != VPInstruction::BranchOnCond)) { - errs() << "VPlan vector loop exit must end with BranchOnCount or " - "BranchOnCond VPInstruction\n"; - return false; + auto *LastInst = dyn_cast(std::prev(Exiting->end())); + if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; + return false; + } } for (const auto &KV : Plan.getLiveOuts()) @@ -338,9 +343,9 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return true; } -bool llvm::verifyVPlanIsValid(const VPlan &Plan) { +bool llvm::verifyVPlanIsValid(const VPlan &Plan, bool IsAbstract) { VPDominatorTree VPDT; VPDT.recalculate(const_cast(Plan)); - VPlanVerifier Verifier(VPDT); + VPlanVerifier Verifier(VPDT, IsAbstract); return Verifier.verify(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h index 3ddc49fda36b8..24fa649e9a225 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h @@ -33,7 +33,10 @@ class VPlan; /// 2. all phi-like recipes must be at the beginning of a block, with no other /// recipes in between. Note that currently there is still an exception for /// VPBlendRecipes. -bool verifyVPlanIsValid(const VPlan &Plan); +/// If \p IsAbstract, consider the VPlan to check as abstract, which means some +/// aspects may not be finalized yet, for example, no terminators have been +/// added to exit loop regions. +bool verifyVPlanIsValid(const VPlan &Plan, bool IsAbstract = false); } // namespace llvm diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index c7447b6a8101e..fa3356970c9fa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -136,12 +136,12 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -709,11 +709,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -967,12 +967,12 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]] ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]] ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1206,12 +1206,12 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 726d98f4d37d3..64583ecef7c4a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -113,10 +113,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 @@ -288,10 +288,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 @@ -459,10 +459,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index 6a7263d649853..aaba812028d82 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -53,12 +53,12 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index df107847e3e32..5b1bb79c310fc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -78,10 +78,10 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index 0347527b46a29..6e17de8091f4d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -20,12 +20,12 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -148,12 +148,12 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -273,12 +273,12 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index a2902242f3591..c406dd7a492ff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -18,6 +18,19 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] @@ -140,11 +153,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 ; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] @@ -158,6 +166,26 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index a531d411153cc..836cd2d62c35b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -18,12 +18,12 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -79,10 +79,10 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], 3 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[UMAX]], 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[UMAX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -143,12 +143,12 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -218,12 +218,12 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = mul [[TMP14]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) @@ -297,12 +297,12 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -374,12 +374,12 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -447,12 +447,12 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[SRC:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -544,12 +544,12 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -614,12 +614,12 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] @@ -692,12 +692,12 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index 3bab341e1c248..55fbe0a1add58 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-LABEL: LV: Checking a loop in 'pointer_induction_used_as_vector' ; CHECK-NOT: LV: Found {{.*}} scalar instruction: %ptr.iv.2.next = getelementptr inbounds i8, ptr %ptr.iv.2, i64 1 ; -; CHECK: VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={vscale x 2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%N> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index ad0d40ee7fbde..0f5ef00ce9baa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -10,8 +10,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-LABEL: LV: Checking a loop in 'test_v4_v4m' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -28,8 +26,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -39,8 +35,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: } ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -57,8 +51,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -73,8 +65,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-LABEL: LV: Checking a loop in 'test_v2_v4m' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -91,8 +81,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXST:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -102,8 +90,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: } ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -120,8 +106,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -135,8 +119,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-LABEL: LV: Checking a loop in 'test_v2_v4' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -153,8 +135,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -164,8 +144,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: } ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -182,8 +160,6 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index def912a6d72cf..4711294f6f722 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -211,12 +211,12 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index d8f14f30295b6..73b1770c073d8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -12,10 +12,10 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -37,13 +37,16 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) ; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) +; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]]) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -98,10 +101,10 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -123,13 +126,16 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) ; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) +; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]]) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index f67b932b04a0d..1bb64442b8155 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -7,9 +7,6 @@ target triple = "arm64-apple-ios" ; CHECK-LABEL: LV: Checking a loop in 'test' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count - ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -26,8 +23,6 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -37,8 +32,6 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: } ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -55,8 +48,6 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using vector intrinsic) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b5aa96eb23f5e..6cab8aec9d0e1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -51,8 +51,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) @@ -73,8 +71,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -192,8 +188,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) @@ -214,8 +208,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 30a98db665cb3..03dce36a740df 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -102,7 +102,7 @@ exit: define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -175,7 +175,7 @@ exit: define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -253,7 +253,7 @@ exit: ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -355,7 +355,7 @@ exit: ; Test case that requires sinking a recipe in a replicate region after another replicate region. define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -437,7 +437,7 @@ exit: ; preds = %loop define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll index b4ab6f7e8ceb7..0bba1779a9440 100644 --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ for.end: ; Check for crash exposed by D76992. ; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll index d872fb187a3bc..91f481b48bcbb 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { ; CHECK: digraph VPlan { -; CHECK-NEXT: graph [labelloc=t, fontsize=30; label="Vectorization Plan\nInitial VPlan for VF=\{4\},UF\>=1\nLive-in vp\<[[VFxUF:%.+]]\> = VF * UF\nLive-in vp\<[[VEC_TC:%.+]]\> = vector-trip-count\nLive-in ir\<%n\> = original trip-count\n"] +; CHECK-NEXT: graph [labelloc=t, fontsize=30; label="Vectorization Plan\nFinal VPlan for VF=\{4\},UF\>=1\nLive-in vp\<[[VFxUF:%.+]]\> = VF * UF\nLive-in vp\<[[VEC_TC:%.+]]\> = vector-trip-count\nLive-in ir\<%n\> = original trip-count\n"] ; CHECK-NEXT: node [shape=rect, fontname=Courier, fontsize=30] ; CHECK-NEXT: edge [fontname=Courier, fontsize=30] ; CHECK-NEXT: compound=true diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index 7ab2459ada2ed..9cf320ac60c4a 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -3,7 +3,7 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK-LABEL: LV: Checking a loop in 'iv_no_binary_op_in_descriptor' -; CHECK: VPlan 'Initial VPlan for VF={8},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={8},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 4c93cb870fadb..19758d576b519 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { ; CHECK-LABEL: Checking a loop in 'print_call_and_memory' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -58,7 +58,7 @@ for.end: ; preds = %for.body, %entry define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x, ptr %z) nounwind uwtable { ; CHECK-LABEL: Checking a loop in 'print_widen_gep_and_select' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -113,7 +113,7 @@ for.end: ; preds = %for.body, %entry define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-LABEL: Checking a loop in 'print_reduction' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -162,7 +162,7 @@ for.end: ; preds = %for.body, %entry define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr noalias %dst) { ; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -210,7 +210,7 @@ for.end: ; preds = %for.body, %entry define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -289,7 +289,7 @@ for.end: ; preds = %for.inc define void @print_interleave_groups(i32 %C, i32 %D) { ; CHECK-LABEL: Checking a loop in 'print_interleave_groups' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<256> = original trip-count @@ -358,7 +358,7 @@ for.end: define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: Checking a loop in 'print_fmuladd_strict' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -413,7 +413,7 @@ for.end: define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 { ; CHECK-LABEL: Checking a loop in 'debug_loc_vpinstruction' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<128> = original trip-count @@ -502,7 +502,7 @@ declare float @llvm.fmuladd.f32(float, float, float) define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-LABEL: Checking a loop in 'print_expand_scev' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -558,7 +558,7 @@ loop.exit: define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-LABEL: Checking a loop in 'print_exit_value' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count @@ -606,7 +606,7 @@ exit: define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %z) { ; CHECK-LABEL: Checking a loop in 'print_fast_math_flags' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -659,7 +659,7 @@ exit: define void @print_exact_flags(i64 %n, ptr noalias %x) { ; CHECK-LABEL: Checking a loop in 'print_exact_flags' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -710,7 +710,7 @@ exit: define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-LABEL: Checking a loop in 'print_call_flags' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -792,7 +792,7 @@ end: ; FIXME: Preserve disjoint flag on OR recipe. define void @print_disjoint_flags(i64 %n, ptr noalias %x) { ; CHECK-LABEL: Checking a loop in 'print_disjoint_flags' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -843,7 +843,7 @@ exit: define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: LV: Checking a loop in 'zext_nneg' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll index 8c18c107b39fc..a2adf6eb21ab1 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -7,7 +7,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Make sure recipes with side-effects are not sunk. define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-LABEL: sink_with_sideeffects -; CHECK: VPlan 'Initial VPlan for VF={1},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={1},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: ir<0> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 41bb3ca8694fa..de75a6fe04658 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -79,7 +79,7 @@ exit: } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -163,7 +163,7 @@ exit: } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -249,7 +249,7 @@ exit: ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -320,7 +320,7 @@ exit: ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -417,7 +417,7 @@ exit: ; loaded value. define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -523,7 +523,7 @@ exit: ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -629,7 +629,7 @@ exit: define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -734,7 +734,7 @@ exit: define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -801,7 +801,7 @@ exit: define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -887,7 +887,7 @@ exit: define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) { ; CHECK-LABEL: LV: Checking a loop in 'update_multiple_users' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<999> = original trip-count @@ -952,7 +952,7 @@ exit: define void @sinking_requires_duplication(ptr %addr) { ; CHECK-LABEL: LV: Checking a loop in 'sinking_requires_duplication' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<201> = original trip-count @@ -1022,7 +1022,7 @@ exit: ; need to be removed before merging. define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -1089,7 +1089,7 @@ exit: define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-LABEL: LV: Checking a loop in 'ptr_induction_remove_dead_recipe' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll index 5ea27994b356d..16d083d871ad3 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll @@ -8,7 +8,7 @@ define void @test_unused_interleave(ptr %src, i32 %length) { ; CHECK-LABEL: Checking a loop in 'test_unused_interleave' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<%0> = VF * UF ; CHECK-NEXT: Live-in vp<%1> = vector-trip-count ; CHECK-NEXT: Live-in ir<%length> = original trip-count From 311f105dacd14ced7e2578f8a144ecd738d4ff38 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 9 Oct 2024 22:14:25 +0100 Subject: [PATCH 2/6] !fixup address latest comments, thanks! --- .../Transforms/Vectorize/LoopVectorize.cpp | 31 +++++++-------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 38 +++++++++++-------- .../Transforms/Vectorize/VPlanTransforms.h | 9 ++--- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +- .../Transforms/Vectorize/VPlanVerifier.cpp | 35 ++++++++--------- llvm/lib/Transforms/Vectorize/VPlanVerifier.h | 5 +-- 7 files changed, 60 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e541c4625a266..9cdb5086e8486 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7550,14 +7550,14 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::unrollByUF(BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); - TailFoldingStyle Style = - CM.getTailFoldingStyle(!isIndvarOverflowCheckKnownFalse(&CM, BestVF)); + TailFoldingStyle Style = CM.getTailFoldingStyle( + !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF)); // When not folding the tail, we know that the induction increment will not // overflow. bool HasNUW = Style == TailFoldingStyle::None; bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; - VPlanTransforms::finalizePlan(BestVPlan, HasNUW, WithoutRuntimeCheck); + VPlanTransforms::lowerCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF @@ -8673,8 +8673,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) break; - assert(verifyVPlanIsValid(*Plan, /*IsAbstract*/ true) && - "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } VF = SubRange.End; @@ -8682,15 +8680,24 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } // Add the required canonical IV. -static void addCanonicalIV(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL) { +static void addCanonicalIV(VPlan &Plan, Type *IdxTy, DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddLiveIn(StartIdx); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + // TODO: Introduce a separate scalar phi recipe that can be used for codegen, + // turning VPCanonicalIVPHIRecipe into an 'abstract' recipe which cannot be + // executed directly. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); + + // Add the BranchOnCount VPInstruction to the latch. + VPBuilder Builder(TopRegion->getExitingBasicBlock()); + // TODO: introduce branch-on-count during VPlan final (pre-codegen) lowering. + Builder.createNaryOp(VPInstruction::BranchOnCount, + {CanonicalIVPHI, &Plan.getVectorTripCount()}, DL); } // Collect VPIRInstructions for phis in the original exit block that are modeled @@ -8940,10 +8947,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); - // When not folding the tail, we know that the induction increment will not - // overflow. - bool HasNUW = Style == TailFoldingStyle::None; - addCanonicalIV(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + addCanonicalIV(*Plan, Legal->getWidestInductionType(), DL); VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); @@ -9176,11 +9180,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); Term->eraseFromParent(); - // Tail folding is not supported for outer loops, so the induction increment - // is guaranteed to not wrap. - bool HasNUW = true; - addCanonicalIV(*Plan, Legal->getWidestInductionType(), HasNUW, DebugLoc()); - assert(verifyVPlanIsValid(*Plan, /*IsAbstract*/ true) && "VPlan is invalid"); + addCanonicalIV(*Plan, Legal->getWidestInductionType(), DebugLoc()); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 7e3cd311f89ab..cfed02b1db438 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1019,7 +1019,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - assert(verifyVPlanIsValid(*this, /*IsAbstract*/ false) && "VPlan is invalid"); + assert(verifyVPlanIsValid(*this) && "VPlan is invalid"); // Initialize CFG state. State->CFG.PrevVPBB = nullptr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a671782dec463..245eecea32da8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1201,8 +1201,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { // %Negated = Not %ALM // branch-on-cond %Negated // -static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( - VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { +static VPActiveLaneMaskPHIRecipe *createActiveLaneMaskPhi(VPlan &Plan) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); @@ -1295,8 +1294,7 @@ void VPlanTransforms::addActiveLaneMask( cast(*FoundWidenCanonicalIVUser); VPSingleDefRecipe *LaneMask; if (UseActiveLaneMaskForControlFlow) { - LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( - Plan, DataAndControlFlowWithoutRuntimeCheck); + LaneMask = createActiveLaneMaskPhi(Plan); } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, @@ -1427,17 +1425,18 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { VPEVL->insertAfter(AVL); VPSingleDefRecipe *OpVPEVL = VPEVL; + VPRecipeBase *LatchTerm = Latch->getTerminator(); if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); IVSize != 32) { OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, CanonicalIVPHI->getScalarType()); - Latch->appendRecipe(OpVPEVL); + OpVPEVL->insertBefore(LatchTerm); } auto *NextEVLIV = new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, {false, false}, CanonicalIVPHI->getDebugLoc(), "index.evl.next"); - Latch->appendRecipe(NextEVLIV); + NextEVLIV->insertBefore(LatchTerm); EVLPhi->addOperand(NextEVLIV); transformRecipestoEVLRecipes(Plan, *VPEVL); @@ -1622,12 +1621,13 @@ void VPlanTransforms::createInterleaveGroups( } } -void VPlanTransforms::finalizePlan(VPlan &Plan, bool HasNUW, - bool DataAndControlFlowWithoutRuntimeCheck) { +void VPlanTransforms::lowerCanonicalIV( + VPlan &Plan, bool HasNUW, bool DataAndControlFlowWithoutRuntimeCheck) { auto *CanIV = Plan.getCanonicalIV(); VPBasicBlock *EB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); - VPBuilder Builder(EB); + auto *Term = EB->getTerminator(); + VPBuilder Builder(Term); DebugLoc DL = CanIV->getDebugLoc(); // Add a VPInstruction to increment the scalar canonical IV by VF * UF. auto *CanonicalIVIncrement = @@ -1642,12 +1642,15 @@ void VPlanTransforms::finalizePlan(VPlan &Plan, bool HasNUW, if (FoundLaneMaskPhi == Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis().end()) { - // Add the BranchOnCount VPInstruction to the latch. - Builder.createNaryOp(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, - DL); + + // Update BranchOnCount VPInstruction in the latch to use increment. + // TODO: Should have separate opcodes for separate semantics. + Term->setOperand(0, CanonicalIVIncrement); return; } + + // Now introduce a conditional branch to control the loop until the lane mask + // is exhuasted. auto *LaneMaskPhi = cast(&*FoundLaneMaskPhi); auto *VecPreheader = cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); @@ -1676,9 +1679,11 @@ void VPlanTransforms::finalizePlan(VPlan &Plan, bool HasNUW, // Create the active lane mask for the next iteration of the loop before the // original terminator. Builder.setInsertPoint(EB); - auto *InLoopIncrement = - Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, {false, false}, DL); + auto *InLoopIncrement = Plan.getUF() > 1 + ? Builder.createOverflowingOp( + VPInstruction::CanonicalIVIncrementForPart, + {IncrementValue}, {false, false}, DL) + : IncrementValue; auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {InLoopIncrement, TripCount}, DL, "active.lane.mask.next"); @@ -1688,4 +1693,5 @@ void VPlanTransforms::finalizePlan(VPlan &Plan, bool HasNUW, // mask here because a true condition means jumping to the exit block. auto *NotMask = Builder.createNot(ALM, DL); Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); + Term->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3d083aedfa168..1a0e7e4c3471c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -122,11 +122,10 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. static void removeDeadRecipes(VPlan &Plan); - /// Finalize \p Plan by introducing recipes needed for code-gen, like - /// introducing explicit increments for the canonical induction and the branch - /// to exit the vector loop. - static void finalizePlan(VPlan &Plan, bool HasNUW, - bool DataAndControlFlowWithoutRuntimeCheck); + /// Finalize \p Plan by introducing explicit increments for the canonical + /// induction. + static void lowerCanonicalIV(VPlan &Plan, bool HasNUW, + bool DataAndControlFlowWithoutRuntimeCheck); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4621c28b05129..e9ad4c4711b7b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -91,7 +91,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV(); // Canonical IV chain is uniform. - if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue()) + if (V == CanonicalIV) // || V == CanonicalIV->getBackedgeValue()) return true; return TypeSwitch(R) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index d51a6c809085a..99bc4c38a3c3c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -31,8 +31,6 @@ class VPlanVerifier { SmallPtrSet WrappedIRBBs; - bool IsAbstract; - // Verify that phi-like recipes are at the beginning of \p VPBB, with no // other recipes in between. Also check that only header blocks contain // VPHeaderPHIRecipes. @@ -62,8 +60,7 @@ class VPlanVerifier { bool verifyRegionRec(const VPRegionBlock *Region); public: - VPlanVerifier(VPDominatorTree &VPDT, bool IsAbstract) - : VPDT(VPDT), IsAbstract(IsAbstract) {} + VPlanVerifier(VPDominatorTree &VPDT) : VPDT(VPDT) {} bool verify(const VPlan &Plan); }; @@ -275,7 +272,7 @@ bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) { if (VPB->getNumSuccessors() > 1 || (VPBB && VPBB->getParent() && VPBB->isExiting() && !VPBB->getParent()->isReplicator())) { - if (!IsAbstract && (!VPBB || !VPBB->getTerminator())) { + if (!VPBB || !VPBB->getTerminator()) { errs() << "Block has multiple successors but doesn't " "have a proper branch recipe!\n"; return false; @@ -405,20 +402,18 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return false; } - if (!IsAbstract) { - if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount or" - "BranchOnCond VPInstruction but is empty\n"; - return false; - } + if (Exiting->empty()) { + errs() << "VPlan vector loop exiting block must end with BranchOnCount or " + "BranchOnCond VPInstruction but is empty\n"; + return false; + } - auto *LastInst = dyn_cast(std::prev(Exiting->end())); - if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && - LastInst->getOpcode() != VPInstruction::BranchOnCond)) { - errs() << "VPlan vector loop exit must end with BranchOnCount or " - "BranchOnCond VPInstruction\n"; - return false; - } + auto *LastInst = dyn_cast(std::prev(Exiting->end())); + if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; + return false; } for (const auto &KV : Plan.getLiveOuts()) @@ -430,9 +425,9 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return true; } -bool llvm::verifyVPlanIsValid(const VPlan &Plan, bool IsAbstract) { +bool llvm::verifyVPlanIsValid(const VPlan &Plan) { VPDominatorTree VPDT; VPDT.recalculate(const_cast(Plan)); - VPlanVerifier Verifier(VPDT, IsAbstract); + VPlanVerifier Verifier(VPDT); return Verifier.verify(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h index 24fa649e9a225..3ddc49fda36b8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h @@ -33,10 +33,7 @@ class VPlan; /// 2. all phi-like recipes must be at the beginning of a block, with no other /// recipes in between. Note that currently there is still an exception for /// VPBlendRecipes. -/// If \p IsAbstract, consider the VPlan to check as abstract, which means some -/// aspects may not be finalized yet, for example, no terminators have been -/// added to exit loop regions. -bool verifyVPlanIsValid(const VPlan &Plan, bool IsAbstract = false); +bool verifyVPlanIsValid(const VPlan &Plan); } // namespace llvm From 53b812d4f97265e8ce042c7abaa1e660ba5169f7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 10 Nov 2024 10:03:25 +0000 Subject: [PATCH 3/6] !fixup fix after merge, update vplan printing tests. --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 - .../Transforms/Vectorize/VPlanTransforms.cpp | 8 +- .../LoopVectorize/vplan-printing.ll | 124 +++++++----------- .../vplan-sink-scalars-and-merge.ll | 82 ++++-------- 5 files changed, 79 insertions(+), 143 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2865c0cd82506..89f1b179c6fb5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7639,9 +7639,6 @@ DenseMap LoopVectorizationPlanner::executePlan( // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. - VPlanTransforms::unrollByUF(BestVPlan, BestUF, - OrigLoop->getHeader()->getContext()); - TailFoldingStyle Style = CM.getTailFoldingStyle( !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF)); // When not folding the tail, we know that the induction increment will not @@ -7650,6 +7647,8 @@ DenseMap LoopVectorizationPlanner::executePlan( bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; VPlanTransforms::lowerCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck); + VPlanTransforms::unrollByUF(BestVPlan, BestUF, + OrigLoop->getHeader()->getContext()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF @@ -8770,6 +8769,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength( *Plan, CM.getMaxSafeElements())) break; + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } VF = SubRange.End; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f0feea22787be..00ab2c1207779 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1005,8 +1005,6 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - assert(verifyVPlanIsValid(*this) && "VPlan is invalid"); - // Initialize CFG state. State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3ad1730fcf391..576e92bf15432 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1809,11 +1809,9 @@ void VPlanTransforms::lowerCanonicalIV( // Create the active lane mask for the next iteration of the loop before the // original terminator. Builder.setInsertPoint(EB); - auto *InLoopIncrement = Plan.getUF() > 1 - ? Builder.createOverflowingOp( - VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, {false, false}, DL) - : IncrementValue; + auto *InLoopIncrement = + Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, + {IncrementValue}, {false, false}, DL); auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {InLoopIncrement, TripCount}, DL, "active.lane.mask.next"); diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index be0cbca7dc108..7c9d4003b0576 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -8,8 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { ; CHECK-LABEL: Checking a loop in 'print_call_and_memory' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -18,7 +17,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> @@ -27,8 +26,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -71,9 +69,8 @@ for.end: ; preds = %for.body, %entry define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x, ptr %z) nounwind uwtable { ; CHECK-LABEL: Checking a loop in 'print_widen_gep_and_select' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -82,7 +79,7 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x, ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0, ir<1>, vp<[[VF]] ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%y>, ir<%iv> @@ -94,8 +91,7 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x, ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -140,8 +136,7 @@ for.end: ; preds = %for.body, %entry define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-LABEL: Checking a loop in 'print_reduction' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -150,15 +145,14 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -203,8 +197,7 @@ for.end: ; preds = %for.body, %entry define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr noalias %dst) { ; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -213,15 +206,14 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> ; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[IV]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sank outside of loop) -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -267,9 +259,8 @@ for.end: ; preds = %for.body, %entry define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -282,7 +273,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-INDUCTION %i = phi 0, %i.next, ir<1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%i>, ir<5> @@ -308,8 +299,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -360,8 +350,7 @@ for.end: ; preds = %for.inc define void @print_interleave_groups(i32 %C, i32 %D) { ; CHECK-LABEL: Checking a loop in 'print_interleave_groups' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<256> = original trip-count ; CHECK-EMPTY: @@ -370,7 +359,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * ir<4> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<4> ; CHECK-NEXT: CLONE ir<%gep.AB.0> = getelementptr inbounds ir<@AB>, ir<0>, vp<[[STEPS]]> @@ -385,8 +374,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) { ; CHECK-NEXT: store ir<1> to index 1 ; CHECK-NEXT: store ir<2> to index 2 ; CHECK-NEXT: store ir<%AB.3> to index 3 -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -441,8 +429,7 @@ for.end: define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: Checking a loop in 'print_fmuladd_strict' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -451,7 +438,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> @@ -462,8 +449,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: WIDEN ir<%l.b> = load vp<[[VEC_PTR2]]> ; CHECK-NEXT: EMIT vp<[[FMUL:%.+]]> = fmul nnan ninf nsz ir<%l.a>, ir<%l.b> ; CHECK-NEXT: REDUCE ir<[[MULADD:%.+]]> = ir<%sum.07> + nnan ninf nsz reduce.fadd (vp<[[FMUL]]>) -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -511,8 +497,7 @@ for.end: define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 { ; CHECK-LABEL: Checking a loop in 'debug_loc_vpinstruction' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<128> = original trip-count ; CHECK-EMPTY: @@ -521,7 +506,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%asd>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%isd> @@ -555,8 +540,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db ; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]> ir<%psd>/vp<[[SEL2]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT:} ; CHECK-NEXT: Successor(s): middle.block @@ -613,9 +597,8 @@ declare float @llvm.fmuladd.f32(float, float, float) define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-LABEL: Checking a loop in 'print_expand_scev' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -631,7 +614,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-INDUCTION\l" + ; CHECK-NEXT: " %iv = phi %iv.next, 0\l" + ; CHECK-NEXT: " ir<%v2>, vp<[[EXP_SCEV]]>, vp<[[VF]]> @@ -640,8 +623,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: WIDEN ir<%v3> = add nuw ir<%v2>, ir<1> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%v3>, ir<%gep> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -685,9 +667,8 @@ loop.exit: define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-LABEL: Checking a loop in 'print_exit_value' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: @@ -696,15 +677,14 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -747,8 +727,7 @@ exit: define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %z) { ; CHECK-LABEL: Checking a loop in 'print_fast_math_flags' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -757,7 +736,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr % ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.y> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.y> @@ -768,8 +747,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr % ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%div> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -813,8 +791,7 @@ exit: define void @print_exact_flags(i64 %n, ptr noalias %x) { ; CHECK-LABEL: Checking a loop in 'print_exact_flags' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -823,7 +800,7 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x> @@ -833,8 +810,7 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) { ; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%div.1>, ir<%div.2> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.x> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -877,8 +853,7 @@ exit: define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-LABEL: Checking a loop in 'print_call_flags' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -887,7 +862,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%src>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%ld.addr> @@ -918,8 +893,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -972,8 +946,7 @@ end: ; FIXME: Preserve disjoint flag on OR recipe. define void @print_disjoint_flags(i64 %n, ptr noalias %x) { ; CHECK-LABEL: Checking a loop in 'print_disjoint_flags' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: @@ -982,7 +955,7 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x> @@ -992,8 +965,7 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) { ; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%or.1>, ir<%or.2> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%add> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -1036,8 +1008,7 @@ exit: define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: LV: Checking a loop in 'zext_nneg' -; CHECK: VPlan 'Final VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: @@ -1046,15 +1017,14 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%p>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l> ; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -1078,7 +1048,6 @@ exit: define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-LABEL: 'print_first_order_recurrence_and_result' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: @@ -1097,8 +1066,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: WIDEN ir<%add> = add vp<[[FOR1_SPLICE]]>, ir<1> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.ptr> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index ee307f219a654..63214a4f80b63 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,9 +10,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -51,8 +50,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: } ; CHECK: loop.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -79,9 +77,8 @@ exit: } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -136,8 +133,7 @@ exit: ; CHECK-NEXT: } ; CHECK: loop.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -164,9 +160,8 @@ exit: } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -221,8 +216,7 @@ exit: ; CHECK-NEXT: } ; CHECK: loop.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -251,9 +245,8 @@ exit: ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<11> = original trip-count @@ -292,8 +285,7 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: Successor(s): loop.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.then.0: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -323,9 +315,8 @@ exit: ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -384,8 +375,7 @@ define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): next.0.1 ; CHECK-EMPTY: ; CHECK-NEXT: next.0.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -421,9 +411,8 @@ exit: ; loaded value. define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -484,8 +473,7 @@ define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.1.1 ; CHECK-EMPTY: ; CHECK-NEXT: then.1.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -528,9 +516,8 @@ exit: ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -592,8 +579,7 @@ define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.1.2 ; CHECK-EMPTY: ; CHECK-NEXT: then.1.2: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -635,9 +621,8 @@ exit: define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -701,8 +686,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.4 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.4: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -741,9 +725,8 @@ exit: define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -781,8 +764,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK-NEXT: loop.2: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -807,9 +789,8 @@ exit: define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count @@ -866,8 +847,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK-NEXT: loop.2: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -893,8 +873,7 @@ exit: define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) { ; CHECK-LABEL: LV: Checking a loop in 'update_multiple_users' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<999> = original trip-count ; CHECK-EMPTY: @@ -925,8 +904,7 @@ define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) { ; CHECK-NEXT: Successor(s): loop.then.1 ; CHECK-EMPTY: ; CHECK-NEXT: loop.then.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -957,8 +935,7 @@ exit: define void @sinking_requires_duplication(ptr %addr) { ; CHECK-LABEL: LV: Checking a loop in 'sinking_requires_duplication' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<201> = original trip-count ; CHECK-EMPTY: @@ -992,8 +969,7 @@ define void @sinking_requires_duplication(ptr %addr) { ; CHECK-NEXT: Successor(s): then.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -1027,8 +1003,7 @@ exit: ; need to be removed before merging. define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<%n> = original trip-count @@ -1063,8 +1038,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n ; CHECK-NEXT: Successor(s): loop.1 ; CHECK-EMPTY: ; CHECK-NEXT: loop.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -1113,7 +1087,6 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-LABEL: LV: Checking a loop in 'ptr_induction_remove_dead_recipe' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -1153,8 +1126,7 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): if.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.0: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block From 4afb1f0f2210a7b97d05b49c07d3290d0ccd683e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 9 Dec 2024 11:36:46 +0000 Subject: [PATCH 4/6] !fixup update to latest main --- .../Transforms/Vectorize/LoopVectorize.cpp | 40 ++++++++----------- .../AArch64/divs-with-scalable-vfs.ll | 4 +- .../AArch64/fully-unrolled-cost.ll | 12 +++--- .../AArch64/induction-costs-sve.ll | 14 +++---- .../AArch64/sve-tail-folding-reductions.ll | 6 +-- .../AArch64/sve2-histcnt-vplan.ll | 12 ++---- .../LoopVectorize/AArch64/sve2-histcnt.ll | 2 +- .../AArch64/uniform-args-call-variants.ll | 30 +++++++------- .../PowerPC/vplan-force-tail-with-evl.ll | 12 ++---- .../LoopVectorize/X86/vplan-vp-intrinsics.ll | 8 +--- .../first-order-recurrence-chains-vplan.ll | 20 +++------- .../LoopVectorize/vplan-iv-transforms.ll | 1 - 12 files changed, 67 insertions(+), 94 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 52e44f141c0e2..ab57b732c4ca0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7674,13 +7674,16 @@ DenseMap LoopVectorizationPlanner::executePlan( // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. - TailFoldingStyle Style = CM.getTailFoldingStyle( - !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF)); - // When not folding the tail, we know that the induction increment will not - // overflow. - bool HasNUW = Style == TailFoldingStyle::None; + bool IVUpdateMayOverflow = + !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF); + TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + // Use NUW for the induction increment if we proved that it won't overflow in + // the vector loop or when not folding the tail. In the later case, we know + // that the canonical induction increment will not overflow as the vector trip + // count is >= increment and a multiple of the increment. + bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; VPlanTransforms::lowerCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck); VPlanTransforms::unrollByUF(BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); @@ -9099,26 +9102,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { PSE, RequiresScalarEpilogueCheck, CM.foldTailByMasking(), OrigLoop); - // Don't use getDecisionAndClampRange here, because we don't know the UF - // so this function is better to be conservative, rather than to split - // it up into different VPlans. - // TODO: Consider using getDecisionAndClampRange here to split up VPlans. - bool IVUpdateMayOverflow = false; - for (ElementCount VF : Range) - IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); - DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); -///<<<<<<< HEAD addCanonicalIV(*Plan, Legal->getWidestInductionType(), DL); -/*=======*/ - /*// Use NUW for the induction increment if we proved that it won't overflow in*/ - /*// the vector loop or when not folding the tail. In the later case, we know*/ - /*// that the canonical induction increment will not overflow as the vector trip*/ - /*// count is >= increment and a multiple of the increment.*/ - /*bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;*/ - /*addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);*/ -/*>>>>>>> origin/main*/ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); @@ -9314,6 +9299,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) return nullptr; + // Don't use getDecisionAndClampRange here, because we don't know the UF + // so this function is better to be conservative, rather than to split + // it up into different VPlans. + // TODO: Consider using getDecisionAndClampRange here to split up VPlans. + bool IVUpdateMayOverflow = false; + for (ElementCount VF : Range) + IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); + + TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); if (useActiveLaneMask(Style)) { // TODO: Move checks to VPlanTransforms::addActiveLaneMask once // TailFoldingStyle is visible there. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 5e7b4ca9d300b..e4c8eabec0265 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -118,12 +118,12 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = mul [[TMP16]], splat (i64 1) @@ -245,12 +245,12 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP0]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) ; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = mul [[TMP16]], splat (i64 1) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index ab29bf8d2d52a..8c19d624f47c4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -10,12 +10,12 @@ define i64 @test(ptr %a, ptr %b) #0 { ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 -; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0> ; CHECK: Cost for VF 8: 26 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 -; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0> ; CHECK: Cost for VF 16: 50 ; CHECK: LV: Selecting VF: vscale x 2 entry: @@ -46,12 +46,12 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 -; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0> ; CHECK: Cost for VF 8: 26 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 -; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0> ; CHECK: Cost for VF 16: 50 ; CHECK: LV: Selecting VF: vscale x 2 entry: @@ -84,14 +84,14 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 -; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0> ; CHECK: Cost for VF 8: 27 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 -; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0> ; CHECK: Cost for VF 16: 51 ; CHECK: LV: Selecting VF: 16 entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index 4bb67c890f3cf..ddbc5b0eded79 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -112,15 +112,15 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP16:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 ; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]] ; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]] ; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[TMP16:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -280,10 +280,10 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) ; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 ; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2 ; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) ; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0 ; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] @@ -454,10 +454,10 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) ; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) ; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 ; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] @@ -651,10 +651,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) ; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) ; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 ; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index 91eb0d5ba7e54..a52e7f54e4be3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -75,12 +75,12 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -202,12 +202,12 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -341,12 +341,12 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll index 9739611a8b6e4..167b0bc076a12 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll @@ -12,7 +12,6 @@ target triple = "aarch64-unknown-linux-gnu" ;; Check that the scalar plan contains the original instructions. ; CHECK: VPlan 'Initial VPlan for VF={1},UF>=1' { -; CHECK-NEXT: Live-in [[VFxUF:.*]] = VF * UF ; CHECK-NEXT: Live-in [[VTC:.*]] = vector-trip-count ; CHECK-NEXT: Live-in [[OTC:.*]] = original trip-count ; CHECK-EMPTY: @@ -21,7 +20,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT [[IV:.*]] = CANONICAL-INDUCTION ir<0>, [[IV_NEXT:.*]] +; CHECK-NEXT: EMIT [[IV:.*]] = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: [[STEPS:vp.*]] = SCALAR-STEPS [[IV]], ir<1> ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]] ; CHECK-NEXT: CLONE [[IDX:.*]] = load [[GEP_IDX]] @@ -30,8 +29,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE [[HISTVAL:.*]] = load [[GEP_BUCKET]] ; CHECK-NEXT: CLONE [[UPDATE:.*]] = add nsw [[HISTVAL]], ir<1> ; CHECK-NEXT: CLONE store [[UPDATE]], [[GEP_BUCKET]] -; CHECK-NEXT: EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]] -; CHECK-NEXT: EMIT branch-on-count [[IV_NEXT]], [[VTC]] +; CHECK-NEXT: EMIT branch-on-count [[IV]], [[VTC]] ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -55,7 +53,6 @@ target triple = "aarch64-unknown-linux-gnu" ;; Check that the vectorized plan contains a histogram recipe instead. ; CHECK: VPlan 'Initial VPlan for VF={vscale x 2,vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in [[VFxUF:.*]] = VF * UF ; CHECK-NEXT: Live-in [[VTC:.*]] = vector-trip-count ; CHECK-NEXT: Live-in [[OTC:.*]] = original trip-count ; CHECK-EMPTY: @@ -64,7 +61,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT [[IV:.*]] = CANONICAL-INDUCTION ir<0>, [[IV_NEXT:.*]] +; CHECK-NEXT: EMIT [[IV:.*]] = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: [[STEPS:vp.*]] = SCALAR-STEPS [[IV]], ir<1> ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]] ; CHECK-NEXT: [[VECP_IDX:vp.*]] = vector-pointer [[GEP_IDX]] @@ -72,8 +69,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64 ; CHECK-NEXT: WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]] ; CHECK-NEXT: WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1> -; CHECK-NEXT: EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]] -; CHECK-NEXT: EMIT branch-on-count [[IV_NEXT]], [[VTC]] +; CHECK-NEXT: EMIT branch-on-count [[IV]], [[VTC]] ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 3b00312959d8a..a3ff58491cdb0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -685,10 +685,10 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP5]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index c799cd35540fa..52a032ee27c83 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -38,12 +38,12 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) -; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP3]], i64 [[N]]) +; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP5]]) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -67,8 +67,8 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 1 ; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP6]]) ; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE: for.cond.cleanup: @@ -124,12 +124,12 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) -; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP3]], i64 [[N]]) +; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP5]]) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -153,8 +153,8 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 1 ; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP6]]) ; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE: for.cond.cleanup: @@ -200,9 +200,9 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; INTERLEAVE-LABEL: define void @test_uniform_not_invariant ; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; INTERLEAVE-NEXT: entry: -; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2) ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ne i64 [[N]], 0 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ugt i64 [[N]], 1 +; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll index a0696b3204dbd..272ca0b9b81ba 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll @@ -9,7 +9,6 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<%N> = original trip-count @@ -19,7 +18,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.*]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: Successor(s): pred.store @@ -46,8 +45,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-NEXT: Successor(s): for.body.2 ; CHECK-EMPTY: ; CHECK-NEXT: for.body.2: -; CHECK-NEXT: EMIT vp<[[CAN_INC:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -73,7 +71,6 @@ for.cond.cleanup: define void @safe_dep(ptr %p) { ; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<512> = original trip-count ; CHECK-EMPTY: @@ -82,7 +79,7 @@ define void @safe_dep(ptr %p) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VPTR1:%.+]]> = vector-pointer ir<%a1> @@ -91,8 +88,7 @@ define void @safe_dep(ptr %p) { ; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset> ; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer ir<%a2> ; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%v> -; CHECK-NEXT: EMIT vp<[[CAN_INC]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll index 1af03e740ef1a..f348f735d4bb3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll @@ -12,7 +12,6 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF ; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count ; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count ; IF-EVL-NEXT: Live-in ir<%N> = original trip-count @@ -36,13 +35,11 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV]]>, vp<[[VTC]]> ; IF-EVL-NEXT: No successors ; IF-EVL-NEXT: } ; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { -; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF ; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count ; NO-VP-NEXT: Live-in ir<%N> = original trip-count ; NO-VP-EMPTY: @@ -63,8 +60,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]> -; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> -; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV]]>, vp<[[VTC]]> ; NO-VP-NEXT: No successors ; NO-VP-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index d2d2063fd9058..8bf4854781423 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -5,7 +5,6 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-LABEL: 'test_chained_first_order_recurrences_1' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: @@ -26,8 +25,7 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: WIDEN ir<%add> = add vp<[[FOR1_SPLICE]]>, vp<[[FOR2_SPLICE]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.ptr> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -77,7 +75,6 @@ exit: define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-LABEL: 'test_chained_first_order_recurrences_3' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: @@ -101,8 +98,7 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: WIDEN ir<%add.2> = add ir<%add.1>, vp<[[FOR3_SPLICE]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.ptr> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add.2> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -161,7 +157,6 @@ exit: define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-LABEL: 'test_chained_first_order_recurrences_4' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<4098> = original trip-count ; CHECK-EMPTY: @@ -171,7 +166,7 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next> ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev> ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> @@ -182,8 +177,7 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -233,7 +227,6 @@ ret: define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { ; CHECK-LABEL: 'test_chained_first_order_recurrences_5_hoist_to_load' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<4098> = original trip-count ; CHECK-EMPTY: @@ -242,7 +235,7 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next> ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev> ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> @@ -256,8 +249,7 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { ; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index befc53dcff887..bea41a3a87ca5 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -20,7 +20,6 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%dst>, vp<[[STEPS:%.+]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%iv> -; CHECK-NEXT: EMIT vp<[[CAN_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } From e1cd5d94dcea05952491596ccda0c845c5e0cc5c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 13 Dec 2024 11:10:56 +0000 Subject: [PATCH 5/6] !fixup address latest comments, thanks! --- .../Transforms/Vectorize/LoopVectorize.cpp | 13 ++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 78 ++++++++++--------- .../Transforms/Vectorize/VPlanTransforms.h | 7 +- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 5 +- 4 files changed, 57 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ccd753900f9b3..116e866cd5987 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7763,19 +7763,19 @@ DenseMap LoopVectorizationPlanner::executePlan( (!VectorizingEpilogue && !ExpandedSCEVs)) && "expanded SCEVs to reuse can only be used during epilogue vectorization"); - // TODO: Move to VPlan transform stage once the transition to the VPlan-based - // cost model is complete for better cost estimates. bool IVUpdateMayOverflow = !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF); TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; // Use NUW for the induction increment if we proved that it won't overflow in - // the vector loop or when not folding the tail. In the later case, we know + // the vector loop or when not folding the tail. In the latter case, we know // that the canonical induction increment will not overflow as the vector trip // count is >= increment and a multiple of the increment. bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; - VPlanTransforms::lowerCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck); + // TODO: Move transforms to VPlan transform stage once the transition to the + // VPlan-based cost model is complete for better cost estimates. + VPlanTransforms::convertCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck); VPlanTransforms::unrollByUF(BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); @@ -8905,7 +8905,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } -// Add the required canonical IV. +// Add the required canonical IV along with its loop branch, but w/o its +// increment - which is introduced later. static void addCanonicalIV(VPlan &Plan, Type *IdxTy, DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddLiveIn(StartIdx); @@ -10099,7 +10100,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, isa(U) || isa(U) || cast(U)->getOpcode() == - Instruction::Add; + VPInstruction::BranchOnCount; }) && "the canonical IV should only be used by its increment or " "ScalarIVSteps when resetting the start value"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0dd8942e0c3ed..cead52bb8f127 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1257,41 +1257,26 @@ void VPlanTransforms::optimize(VPlan &Plan) { licm(Plan); } -// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace -// the loop terminator with a branch-on-cond recipe with the negated -// active-lane-mask as operand. Note that this turns the loop into an -// uncountable one. Only the existing terminator is replaced, all other existing -// recipes/users remain unchanged, except for poison-generating flags being -// dropped from the canonical IV increment. Return the created +// Add a VPActiveLaneMaskPHIRecipe and recipes to compute its start lane mask to +// \p Plan, but w/o its increment in the loop region or adjusting the exit +// conditions of the loop region - those are adjusted later. Return the created // VPActiveLaneMaskPHIRecipe. // // The function uses the following definitions: // -// %TripCount = DataWithControlFlowWithoutRuntimeCheck ? -// calculate-trip-count-minus-VF (original TC) : original TC -// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? -// CanonicalIVPhi : CanonicalIVIncrement // %StartV is the canonical induction start value. // // The function adds the following recipes: // // vector.ph: -// %TripCount = calculate-trip-count-minus-VF (original TC) -// [if DataWithControlFlowWithoutRuntimeCheck] // %EntryInc = canonical-iv-increment-for-part %StartV -// %EntryALM = active-lane-mask %EntryInc, %TripCount +// %EntryALM = active-lane-mask %EntryInc, original trip count // // vector.body: // ... -// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] -// ... -// %InLoopInc = canonical-iv-increment-for-part %IncrementValue -// %ALM = active-lane-mask %InLoopInc, TripCount -// %Negated = Not %ALM -// branch-on-cond %Negated +// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ] // static VPActiveLaneMaskPHIRecipe *createActiveLaneMaskPhi(VPlan &Plan) { - VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); @@ -1600,25 +1585,27 @@ bool VPlanTransforms::tryAddExplicitVectorLength( DebugLoc()); VPSingleDefRecipe *OpVPEVL = VPEVL; - VPRecipeBase *LatchTerm = Latch->getTerminator(); + VPRecipeBase *LatchBranch = Latch->getTerminator(); if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); IVSize != 32) { OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, CanonicalIVPHI->getScalarType()); - OpVPEVL->insertBefore(LatchTerm); + OpVPEVL->insertBefore(LatchBranch); } + // TODO: Set flags when introducing the increment here. auto *NextEVLIV = new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, {false, false}, CanonicalIVPHI->getDebugLoc(), "index.evl.next"); - NextEVLIV->insertBefore(LatchTerm); + NextEVLIV->insertBefore(LatchBranch); EVLPhi->addOperand(NextEVLIV); transformRecipestoEVLRecipes(Plan, *VPEVL); - // Replace all uses of VPCanonicalIVPHIRecipe by - // VPEVLBasedIVPHIRecipe. - CanonicalIVPHI->replaceAllUsesWith(EVLPhi); + // Replace all uses of VPCanonicalIVPHIRecipe by VPEVLBasedIVPHIRecipe except + // LatchBranch. + CanonicalIVPHI->replaceUsesWithIf( + EVLPhi, [LatchBranch](VPUser &U, unsigned) { return &U != LatchBranch; }); // TODO: support unroll factor > 1. Plan.setUF(1); return true; @@ -1797,19 +1784,18 @@ void VPlanTransforms::createInterleaveGroups( } } -void VPlanTransforms::lowerCanonicalIV( +void VPlanTransforms::convertCanonicalIV( VPlan &Plan, bool HasNUW, bool DataAndControlFlowWithoutRuntimeCheck) { auto *CanIV = Plan.getCanonicalIV(); - VPBasicBlock *EB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); - auto *Term = EB->getTerminator(); - VPBuilder Builder(Term); + VPBasicBlock *Latch = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + auto *LatchTerm = Latch->getTerminator(); + VPBuilder Builder(LatchTerm); DebugLoc DL = CanIV->getDebugLoc(); // Add a VPInstruction to increment the scalar canonical IV by VF * UF. auto *CanonicalIVIncrement = Builder.createOverflowingOp(Instruction::Add, {CanIV, &Plan.getVFxUF()}, {HasNUW, false}, DL, "index.next"); - CanIV->addOperand(CanonicalIVIncrement); auto FoundLaneMaskPhi = find_if( @@ -1821,12 +1807,32 @@ void VPlanTransforms::lowerCanonicalIV( // Update BranchOnCount VPInstruction in the latch to use increment. // TODO: Should have separate opcodes for separate semantics. - Term->setOperand(0, CanonicalIVIncrement); + LatchTerm->setOperand(0, CanonicalIVIncrement); return; } - // Now introduce a conditional branch to control the loop until the lane mask - // is exhuasted. + // Now adjust the start value of the active-lane-mask depending on + // DataAndControlFlowWithoutRuntimeCheck, introduce its increment and a + // conditional branch to control the loop until the lane mask is exhausted. + // Concretely, we add the following recipes: + // + // vector.ph: + // %TripCount = calculate-trip-count-minus-VF (original TC) + // [if DataWithControlFlowWithoutRuntimeCheck] + // %EntryInc = canonical-iv-increment-for-part %StartV + // %EntryALM = active-lane-mask %EntryInc, %TripCount (replaces the + // existing start value for the existing active-lane-mask-phi) + // + // vector.body: + // ... + // (existing) %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], + // (added increment) [ %ALM, %vector.body ] + // ... + // %InLoopInc = canonical-iv-increment-for-part %IncrementValue + // %ALM = active-lane-mask %InLoopInc, TripCount + // %Negated = Not %ALM + // branch-on-cond %Negated (replaces existing BranchOnCount) + auto *LaneMaskPhi = cast(&*FoundLaneMaskPhi); auto *VecPreheader = cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); @@ -1854,7 +1860,7 @@ void VPlanTransforms::lowerCanonicalIV( } // Create the active lane mask for the next iteration of the loop before the // original terminator. - Builder.setInsertPoint(EB); + Builder.setInsertPoint(Latch); auto *InLoopIncrement = Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, {false, false}, DL); @@ -1867,7 +1873,7 @@ void VPlanTransforms::lowerCanonicalIV( // mask here because a true condition means jumping to the exit block. auto *NotMask = Builder.createNot(ALM, DL); Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); - Term->eraseFromParent(); + LatchTerm->eraseFromParent(); } void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 9685625c29988..54901fd137973 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -135,8 +135,11 @@ struct VPlanTransforms { BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder); - /// Finalize \p Plan by introducing explicit increments for the canonical - /// induction. + /// Complete the canonical IV of \p Plan by introducing an explicit increment + /// between its header phi and latch branch on count. If there is an + /// active-lane-mask PHI, adjust the start value depending on \p + /// DataAndControlFlowWithoutRuntimeCheck, add its increment and adjust the + /// loop region's terminator to BranchOnCond based on the active-lane-mask. static void convertCanonicalIV(VPlan &Plan, bool HasNUW, bool DataAndControlFlowWithoutRuntimeCheck); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index ac4560d377fcd..5049715ac99a9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -90,8 +90,9 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { } auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV(); - // Canonical IV chain is uniform. - if (V == CanonicalIV) // || V == CanonicalIV->getBackedgeValue()) + // Canonical IV chain is uniform and so is its backedge value if it exists. + if (V == CanonicalIV || (CanonicalIV->getNumOperands() == 2 && + V == CanonicalIV->getBackedgeValue())) return true; return TypeSwitch(R) From 62e78e99a624ed4fa07fd4dd882844d7e38cfb63 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 21 Jan 2025 13:58:11 +0000 Subject: [PATCH 6/6] !fixup update after recent merge --- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 +- .../AArch64/sve-tail-folding-unroll.ll | 36 +++---------------- .../LoopVectorize/AArch64/sve-tail-folding.ll | 10 +++++- .../LoopVectorize/AArch64/sve-widen-gep.ll | 2 +- .../LoopVectorize/AArch64/vplan-printing.ll | 8 ++--- .../widen-call-with-intrinsic-or-libfunc.ll | 4 +++ .../RISCV/riscv-vector-reverse.ll | 8 ++--- .../truncate-to-minimal-bitwidth-evl-crash.ll | 2 +- .../LoopVectorize/vplan-iv-transforms.ll | 2 +- .../vplan-sink-scalars-and-merge-vf1.ll | 6 ++-- 10 files changed, 28 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index dc36e4f902305..1c743276d9f17 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1792,7 +1792,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength( IVSize != 32) { OpVPEVL = new VPScalarCastRecipe( IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, - CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc()); + CanonicalIVPHI->getScalarType(), CanonicalIVPHI->getDebugLoc()); OpVPEVL->insertBefore(LatchBranch); } // TODO: Set flags when introducing the increment here. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index d23d544961912..ec45df0f91ec8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -36,19 +36,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -148,21 +135,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 ; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]] @@ -226,10 +198,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 12 ; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP24]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP24]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP24]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP24]]) ; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0 ; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index c2e109f88d548..cd9c368129ab0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -18,6 +18,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] @@ -142,6 +143,7 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] @@ -216,12 +218,12 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul [[TMP13]], splat (i64 4) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP15]] @@ -292,6 +294,7 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] @@ -368,6 +371,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] @@ -440,6 +444,7 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] @@ -535,6 +540,7 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] @@ -604,6 +610,7 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] @@ -681,6 +688,7 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index c6dc1ee93c517..603bd98e81497 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-LABEL: LV: Checking a loop in 'pointer_induction_used_as_vector' ; CHECK-NOT: LV: Found {{.*}} scalar instruction: %ptr.iv.2.next = getelementptr inbounds i8, ptr %ptr.iv.2, i64 1 ; -; CHECK: VPlan 'Final VPlan for VF={vscale x 2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%N> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index ccf8540b4ebf7..8715d03bd1f58 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -8,7 +8,6 @@ target triple = "aarch64-none-unknown-elf" define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: @@ -20,7 +19,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> @@ -33,8 +32,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> ; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -42,7 +40,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> ; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<%1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index 98540f752a767..5786feef21337 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -7,6 +7,7 @@ target triple = "arm64-apple-ios" ; CHECK-LABEL: LV: Checking a loop in 'test' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -26,6 +27,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: WIDEN-CALL ir<%s> = call reassoc nnan ninf nsz arcp contract afn @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -49,6 +51,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: } ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -68,6 +71,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: WIDEN-INTRINSIC ir<%s> = call reassoc nnan ninf nsz arcp contract afn llvm.sin(ir<%conv>) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index f630f4f21e065..e1d4d2b0d6015 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -54,7 +54,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -82,8 +81,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block @@ -303,7 +301,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -331,8 +328,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll index 68b36f23de4b0..fd334ffbcf7d5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll @@ -33,7 +33,7 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) { ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.trunc.nxv1i8.nxv1i16( [[VP_OP1]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv1i8.nxv1p0( [[TMP8]], align 1 zeroinitializer, splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index 94d3edad56e67..9f9db85be6bd2 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -3,7 +3,7 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK-LABEL: LV: Checking a loop in 'iv_no_binary_op_in_descriptor' -; CHECK: VPlan 'Final VPlan for VF={8},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={8},UF=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll index 3b068947fd863..53df1277c71d7 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -7,8 +7,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Make sure recipes with side-effects are not sunk. define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-LABEL: sink_with_sideeffects -; CHECK: VPlan 'Final VPlan for VF={1},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK: VPlan 'Initial VPlan for VF={1},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: ir<1024> = original trip-count ; CHECK-EMPTY: @@ -42,8 +41,7 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-NEXT: } ; CHECK: if.then.0: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block