diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c03c278fcebe7..61fc02cdf9b8b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -161,6 +161,15 @@ class VPBuilder {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
+
+  VPInstruction *createFPOp(unsigned Opcode,
+                            std::initializer_list<VPValue *> Operands,
+                            DebugLoc DL = {}, const Twine &Name = "",
+                            FastMathFlags FMFs = {}) {
+    auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
+    return tryInsertInstruction(Op);
+  }
+
   VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
                      const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 37b8023e1fcf2..0d5318f4d2775 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -552,8 +552,7 @@ class InnerLoopVectorizer {
   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
   /// Instr's operands.
   void scalarizeInstruction(const Instruction *Instr,
-                            VPReplicateRecipe *RepRecipe,
-                            const VPIteration &Instance,
+                            VPReplicateRecipe *RepRecipe, const VPLane &Lane,
                             VPTransformState &State);
 
   /// Try to vectorize interleaved access group \p Group with the base address
@@ -2451,7 +2450,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
 
   // Prepare for the new pointers.
-  SmallVector<Value *, 2> AddrParts;
   unsigned Index = Group->getIndex(Instr);
 
   // TODO: extend the masked interleaved-group support to reversed access.
@@ -2474,40 +2472,37 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
   } else
     Idx = Builder.getInt32(-Index);
 
-  for (unsigned Part = 0; Part < UF; Part++) {
-    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
-    if (auto *I = dyn_cast<Instruction>(AddrPart))
-      State.setDebugLocFrom(I->getDebugLoc());
+  Value *AddrV = State.get(Addr, VPLane(0));
+  if (auto *I = dyn_cast<Instruction>(AddrV))
+    State.setDebugLocFrom(I->getDebugLoc());
 
-    // Notice current instruction could be any index. Need to adjust the address
-    // to the member of index 0.
-    //
-    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-    //       b = A[i];       // Member of index 0
-    // Current pointer is pointed to A[i+1], adjust it to A[i].
-    //
-    // E.g.  A[i+1] = a;     // Member of index 1
-    //       A[i]   = b;     // Member of index 0
-    //       A[i+2] = c;     // Member of index 2 (Current instruction)
-    // Current pointer is pointed to A[i+2], adjust it to A[i].
+  // Notice current instruction could be any index. Need to adjust the address
+  // to the member of index 0.
+  //
+  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
+  //       b = A[i];       // Member of index 0
+  // Current pointer is pointed to A[i+1], adjust it to A[i].
+  //
+  // E.g.  A[i+1] = a;     // Member of index 1
+  //       A[i]   = b;     // Member of index 0
+  //       A[i+2] = c;     // Member of index 2 (Current instruction)
+  // Current pointer is pointed to A[i+2], adjust it to A[i].
- bool InBounds = false; - if (auto *gep = dyn_cast(AddrPart->stripPointerCasts())) - InBounds = gep->isInBounds(); - AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); - AddrParts.push_back(AddrPart); - } + bool InBounds = false; + if (auto *gep = dyn_cast(AddrV->stripPointerCasts())) + InBounds = gep->isInBounds(); + AddrV = Builder.CreateGEP(ScalarTy, AddrV, Idx, "", InBounds); State.setDebugLocFrom(Instr->getDebugLoc()); Value *PoisonVec = PoisonValue::get(VecTy); - auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( - unsigned Part, Value *MaskForGaps) -> Value * { + auto CreateGroupMask = [this, &BlockInMask, &State, + &InterleaveFactor](Value *MaskForGaps) -> Value * { if (VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); - auto *BlockInMaskPart = State.get(BlockInMask, Part); + auto *BlockInMaskPart = State.get(BlockInMask); SmallVector Ops = {BlockInMaskPart, BlockInMaskPart}; auto *MaskTy = VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); @@ -2518,7 +2513,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!BlockInMask) return MaskForGaps; - Value *BlockInMaskPart = State.get(BlockInMask, Part); + Value *BlockInMaskPart = State.get(BlockInMask); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), @@ -2538,54 +2533,47 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // For each unroll part, create a wide load for the group. - SmallVector NewLoads; - for (unsigned Part = 0; Part < UF; Part++) { - Instruction *NewLoad; - if (BlockInMask || MaskForGaps) { - assert(useMaskedInterleavedAccesses(*TTI) && - "masked interleaved groups are not allowed."); - Value *GroupMask = CreateGroupMask(Part, MaskForGaps); - NewLoad = - Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), - GroupMask, PoisonVec, "wide.masked.vec"); - } - else - NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], - Group->getAlign(), "wide.vec"); - Group->addMetadata(NewLoad); - NewLoads.push_back(NewLoad); - } + Instruction *NewLoad; + if (BlockInMask || MaskForGaps) { + assert(useMaskedInterleavedAccesses(*TTI) && + "masked interleaved groups are not allowed."); + Value *GroupMask = CreateGroupMask(MaskForGaps); + NewLoad = + Builder.CreateMaskedLoad(VecTy, AddrV, Group->getAlign(), GroupMask, + PoisonVec, "wide.masked.vec"); + } else + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrV, Group->getAlign(), + "wide.vec"); + Group->addMetadata(NewLoad); if (VecTy->isScalableTy()) { assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); - for (unsigned Part = 0; Part < UF; ++Part) { // Scalable vectors cannot use arbitrary shufflevectors (only splats), // so must use intrinsics to deinterleave. 
- Value *DI = Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part], - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); - - if (!Member) - continue; + Value *DI = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2, + VecTy, NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + unsigned J = 0; + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + if (!Member) + continue; - Value *StridedVec = Builder.CreateExtractValue(DI, I); - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); - StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); - } + Value *StridedVec = Builder.CreateExtractValue(DI, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); + } - if (Group->isReverse()) - StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); + if (Group->isReverse()) + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); - State.set(VPDefs[J], StridedVec, Part); - ++J; - } + State.set(VPDefs[J], StridedVec); + ++J; } return; @@ -2603,22 +2591,20 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto StrideMask = createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); - for (unsigned Part = 0; Part < UF; Part++) { - Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], StrideMask, "strided.vec"); - - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); - StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); - } + Value *StridedVec = + Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); + } - if (Group->isReverse()) - StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); + if (Group->isReverse()) + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); - State.set(VPDefs[J], StridedVec, Part); - } + State.set(VPDefs[J], StridedVec); ++J; } return; @@ -2634,63 +2620,54 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( "masked interleaved groups are not allowed."); assert((!MaskForGaps || !VF.isScalable()) && "masking gaps for scalable vectors is not yet supported."); - for (unsigned Part = 0; Part < UF; Part++) { - // Collect the stored vector from each member. - SmallVector StoredVecs; - unsigned StoredIdx = 0; - for (unsigned i = 0; i < InterleaveFactor; i++) { - assert((Group->getMember(i) || MaskForGaps) && - "Fail to get a member from an interleaved store group"); - Instruction *Member = Group->getMember(i); + // Collect the stored vector from each member. 
+ SmallVector StoredVecs; + unsigned StoredIdx = 0; + for (unsigned i = 0; i < InterleaveFactor; i++) { + assert((Group->getMember(i) || MaskForGaps) && + "Fail to get a member from an interleaved store group"); + Instruction *Member = Group->getMember(i); - // Skip the gaps in the group. - if (!Member) { - Value *Undef = PoisonValue::get(SubVT); - StoredVecs.push_back(Undef); - continue; - } + // Skip the gaps in the group. + if (!Member) { + Value *Undef = PoisonValue::get(SubVT); + StoredVecs.push_back(Undef); + continue; + } - Value *StoredVec = State.get(StoredValues[StoredIdx], Part); - ++StoredIdx; + Value *StoredVec = State.get(StoredValues[StoredIdx]); + ++StoredIdx; - if (Group->isReverse()) - StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); + if (Group->isReverse()) + StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); - // If this member has different type, cast it to a unified type. + // If this member has different type, cast it to a unified type. - if (StoredVec->getType() != SubVT) - StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); - StoredVecs.push_back(StoredVec); - } + StoredVecs.push_back(StoredVec); + } - // Interleave all the smaller vectors into one wider vector. - Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); - Instruction *NewStoreInstr; - if (BlockInMask || MaskForGaps) { - Value *GroupMask = CreateGroupMask(Part, MaskForGaps); - NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], - Group->getAlign(), GroupMask); - } else - NewStoreInstr = - Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); + Instruction *NewStoreInstr; + if (BlockInMask || MaskForGaps) { + Value *GroupMask = CreateGroupMask(MaskForGaps); + NewStoreInstr = + Builder.CreateMaskedStore(IVec, AddrV, Group->getAlign(), GroupMask); + } else + NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrV, Group->getAlign()); - Group->addMetadata(NewStoreInstr); - } + Group->addMetadata(NewStoreInstr); } void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, - const VPIteration &Instance, + const VPLane &Lane, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for - // the first lane and part. - if (isa(Instr)) - if (!Instance.isFirstIteration()) - return; - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2713,18 +2690,18 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (const auto &I : enumerate(RepRecipe->operands())) { - auto InputInstance = Instance; + auto InputLane = Lane; VPValue *Operand = I.value(); if (vputils::isUniformAfterVectorization(Operand)) - InputInstance.Lane = VPLane::getFirstLane(); - Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); + InputLane = VPLane::getFirstLane(); + Cloned->setOperand(I.index(), State.get(Operand, InputLane)); } State.addNewMetadata(Cloned, Instr); // Place the cloned scalar in the new loop. 
State.Builder.Insert(Cloned); - State.set(RepRecipe, Cloned, Instance); + State.set(RepRecipe, Cloned, Lane); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) @@ -3250,7 +3227,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); assert(StepVPV && "step must have been expanded during VPlan execution"); Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() - : State.get(StepVPV, {0, 0}); + : State.get(StepVPV, VPLane(0)); Value *Escape = emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II.getKind(), II.getInductionBinOp()); @@ -3474,11 +3451,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO, // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. VPValue *VPExtract = LO->getOperand(0); - using namespace llvm::VPlanPatternMatch; - assert(match(VPExtract, m_VPInstruction( - m_VPValue(), m_VPValue())) && - "FOR LiveOut expects to use an extract from end."); - Value *ResumeScalarFOR = State.get(VPExtract, UF - 1, true); + Value *ResumeScalarFOR = State.get(VPExtract, true); // Fix the initial value of the original recurrence in the scalar loop. PHINode *ScalarHeaderPhi = LO->getPhi(); @@ -3579,13 +3552,13 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, VPWidenPHIRecipe *VPPhi = dyn_cast(&P); if (!VPPhi) continue; - PHINode *NewPhi = cast(State.get(VPPhi, 0)); + PHINode *NewPhi = cast(State.get(VPPhi)); // Make sure the builder has a valid insert point. Builder.SetInsertPoint(NewPhi); for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { VPValue *Inc = VPPhi->getIncomingValue(i); VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); - NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); + NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); } } } @@ -7360,8 +7333,8 @@ static void createAndCollectMergePhiForReduction( auto *PhiR = cast(RedResult->getOperand(0)); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - Value *FinalValue = - State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); + TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Value *FinalValue = State.get(RedResult, VPLane::getFirstLane()); auto *ResumePhi = dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( @@ -7428,6 +7401,8 @@ LoopVectorizationPlanner::executePlan( "expanded SCEVs to reuse can only be used during epilogue vectorization"); (void)IsEpilogueVectorization; + VPlanTransforms::interleave(BestVPlan, BestUF, + OrigLoop->getHeader()->getModule()->getContext()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF @@ -7447,7 +7422,7 @@ LoopVectorizationPlanner::executePlan( BestVPlan.getPreheader()->execute(&State); } if (!ILV.getTripCount()) - ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); + ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); else assert(IsEpilogueVectorization && "should only re-use the existing trip " "count during epilogue vectorization"); @@ -9150,40 +9125,52 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { "Recipe should have been replaced"); auto *IVR = getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast(State.get(IVR, 0, /*IsScalar*/ 
true));
+  PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, /*IsScalar*/ true));
+  unsigned Part = 0;
+  if (getNumOperands() == 4)
+    Part = cast<ConstantInt>(getOperand(3)->getLiveInIRValue())->getZExtValue();
 
   Type *PhiType = IndDesc.getStep()->getType();
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
   Type *ScStValueType = ScalarStartValue->getType();
-  PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
-                                           CanonicalIV->getIterator());
+  PHINode *NewPointerPhi = nullptr;
 
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  if (getNumOperands() == 4) {
+    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(2)));
+    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
+  } else {
+    NewPointerPhi =
+        PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  }
 
   // A pointer induction, performed by using a gep
   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
-  Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
+  Value *ScalarStepValue = State.get(getOperand(1), VPLane(0));
   Value *NumUnrolledElems =
       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
-  Value *InductionGEP = GetElementPtrInst::Create(
-      State.Builder.getInt8Ty(), NewPointerPhi,
-      State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
-      InductionLoc);
+
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
-  NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  if (getNumOperands() != 4) {
+    Value *InductionGEP = GetElementPtrInst::Create(
+        State.Builder.getInt8Ty(), NewPointerPhi,
+        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+        InductionLoc);
+
+    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  }
 
   // Create UF many actual address geps that use the pointer
   // phi as base and a vectorized version of the step value
   // (<step*0, step*1, ..., step*N>) as offset.
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
     Type *VecPhiType = VectorType::get(PhiType, State.VF);
     Value *StartOffsetScalar =
         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
@@ -9193,7 +9180,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
     StartOffset = State.Builder.CreateAdd(
         StartOffset, State.Builder.CreateStepVector(VecPhiType));
 
-    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
+    assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
           "scalar step must be the same across all parts");
     Value *GEP = State.Builder.CreateGEP(
         State.Builder.getInt8Ty(), NewPointerPhi,
@@ -9201,31 +9188,30 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
             StartOffset,
             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
             "vector.gep"));
-    State.set(this, GEP, Part);
-  }
+  State.set(this, GEP, 0);
 }
 
 void VPDerivedIVRecipe::execute(VPTransformState &State) {
-  assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
+  assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
 
   // Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); if (FPBinOp) State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0)); + Value *Step = State.get(getStepValue(), VPLane(0)); + Value *CanonicalIV = State.get(getOperand(1), VPLane(0)); Value *DerivedIV = emitTransformedIndex( State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present(FPBinOp)); DerivedIV->setName("offset.idx"); assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); - State.set(this, DerivedIV, VPIteration(0, 0)); + State.set(this, DerivedIV, VPLane(0)); } void VPInterleaveRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Interleave group being replicated."); + assert(!State.Lane && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), getStoredValues(), getMask(), NeedsMaskForGaps); @@ -9233,43 +9219,26 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); - if (State.Instance) { // Generate a single instance. + if (State.Lane) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); + State.ILV->scalarizeInstruction(UI, this, *State.Lane, State); // Insert scalar instance packing it into a vector. if (State.VF.isVector() && shouldPack()) { // If we're constructing lane 0, initialize to start from poison. - if (State.Instance->Lane.isFirstLane()) { + if (State.Lane->isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Poison = PoisonValue::get( VectorType::get(UI->getType(), State.VF)); - State.set(this, Poison, State.Instance->Part); + State.set(this, Poison); } - State.packScalarIntoVectorValue(this, *State.Instance); + State.packScalarIntoVectorValue(this, *State.Lane); } return; } if (IsUniform) { - // If the recipe is uniform across all parts (instead of just per VF), only - // generate a single instance. - if ((isa(UI) || isa(UI)) && - all_of(operands(), [](VPValue *Op) { - return Op->isDefinedOutsideVectorRegions(); - })) { - State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); - if (user_begin() != user_end()) { - for (unsigned Part = 1; Part < State.UF; ++Part) - State.set(this, State.get(this, VPIteration(0, 0)), - VPIteration(Part, 0)); - } - return; - } - - // Uniform within VL means we need to generate lane 0 only for each - // unrolled copy. - for (unsigned Part = 0; Part < State.UF; ++Part) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); + // Uniform within VL means we need to generate lane 0 only. + State.ILV->scalarizeInstruction(UI, this, VPLane(0), State); return; } @@ -9278,17 +9247,15 @@ void VPReplicateRecipe::execute(VPTransformState &State) { if (isa(UI) && vputils::isUniformAfterVectorization(getOperand(1))) { auto Lane = VPLane::getLastLaneForVF(State.VF); - State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), - State); + State.ILV->scalarizeInstruction(UI, this, Lane, State); return; } - // Generate scalar instances for all VF lanes of all UF parts. + // Generate scalar instances for all VF lanes. 
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); const unsigned EndLane = State.VF.getKnownMinValue(); - for (unsigned Part = 0; Part < State.UF; ++Part) - for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); + for (unsigned Lane = 0; Lane < EndLane; ++Lane) + State.ILV->scalarizeInstruction(UI, this, Lane, State); } void VPWidenLoadRecipe::execute(VPTransformState &State) { @@ -9301,18 +9268,17 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); - for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; Value *Mask = nullptr; if (auto *VPMask = getMask()) { // Mask reversal is only needed for non-all-one (null) masks, as reverse // of a null all-one mask is a null mask. - Mask = State.get(VPMask, Part); + Mask = State.get(VPMask); if (isReverse()) Mask = Builder.CreateVectorReverse(Mask, "reverse"); } - Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather); + Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather); if (CreateGather) { NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, "wide.masked.gather"); @@ -9327,8 +9293,7 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { State.addMetadata(NewLI, LI); if (Reverse) NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); - State.set(this, NewLI, Part); - } + State.set(this, NewLI); } /// Use all-true mask for reverse rather than actual mask, as it avoids a @@ -9355,11 +9320,11 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); CallInst *NewLI; - Value *EVL = State.get(getEVL(), VPIteration(0, 0)); - Value *Addr = State.get(getAddr(), 0, !CreateGather); + Value *EVL = State.get(getEVL(), VPLane(0)); + Value *Addr = State.get(getAddr(), !CreateGather); Value *Mask = nullptr; if (VPValue *VPMask = getMask()) { - Mask = State.get(VPMask, 0); + Mask = State.get(VPMask); if (isReverse()) Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); } else { @@ -9382,7 +9347,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { Instruction *Res = NewLI; if (isReverse()) Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); - State.set(this, Res, 0); + State.set(this, Res); } void VPWidenStoreRecipe::execute(VPTransformState &State) { @@ -9395,18 +9360,17 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); - for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; Value *Mask = nullptr; if (auto *VPMask = getMask()) { // Mask reversal is only needed for non-all-one (null) masks, as reverse // of a null all-one mask is a null mask. - Mask = State.get(VPMask, Part); + Mask = State.get(VPMask); if (isReverse()) Mask = Builder.CreateVectorReverse(Mask, "reverse"); } - Value *StoredVal = State.get(StoredVPValue, Part); + Value *StoredVal = State.get(StoredVPValue); if (isReverse()) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. @@ -9414,7 +9378,7 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). 
} - Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); + Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); if (CreateScatter) NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); else if (Mask) @@ -9422,7 +9386,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { else NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); State.addMetadata(NewSI, SI); - } } void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { @@ -9438,19 +9401,19 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); CallInst *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, 0); - Value *EVL = State.get(getEVL(), VPIteration(0, 0)); + Value *StoredVal = State.get(StoredValue); + Value *EVL = State.get(getEVL(), VPLane(0)); if (isReverse()) StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); Value *Mask = nullptr; if (VPValue *VPMask = getMask()) { - Mask = State.get(VPMask, 0); + Mask = State.get(VPMask); if (isReverse()) Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); } else { Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); } - Value *Addr = State.get(getAddr(), 0, !CreateScatter); + Value *Addr = State.get(getAddr(), !CreateScatter); if (CreateScatter) { NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), Intrinsic::vp_scatter, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f17be451e6846..cf4d5f0e4cba8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -223,47 +223,47 @@ VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {} -Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { +Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) return Def->getLiveInIRValue(); - if (hasScalarValue(Def, Instance)) { - return Data - .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)]; + if (hasScalarValue(Def, Lane)) { + return Data.Scalars[Def][Lane.mapToCacheIndex(VF)]; } - assert(hasVectorValue(Def, Instance.Part)); - auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; + assert(hasVectorValue(Def)); + auto *VecPart = Data.Output[Def]; if (!VecPart->getType()->isVectorTy()) { - assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar"); + assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar"); return VecPart; } // TODO: Cache created scalar values. 
-  Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
-  auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
-  // set(Def, Extract, Instance);
+  Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
+  auto *Extract = Builder.CreateExtractElement(VecPart, LaneV);
+  // set(Def, Extract, Lane);
   return Extract;
 }
 
-Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
+Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
   if (NeedsScalar) {
-    assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def, Part) ||
-            (hasScalarValue(Def, VPIteration(Part, 0)) &&
-             Data.PerPartScalars[Def][Part].size() == 1)) &&
-           "Trying to access a single scalar per part but has multiple scalars "
-           "per part.");
-    return get(Def, VPIteration(Part, 0));
+    assert(
+        (VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def) ||
+         (hasScalarValue(Def, VPLane(0)) && Data.Scalars[Def].size() == 1)) &&
+        "Trying to access a single scalar per part but has multiple scalars "
+        "per part.");
+    return get(Def, VPLane(0));
   }
 
   // If Values have been set for this Def return the one relevant for \p Part.
-  if (hasVectorValue(Def, Part))
-    return Data.PerPartOutput[Def][Part];
+  if (hasVectorValue(Def))
+    return Data.Output[Def];
 
   auto GetBroadcastInstrs = [this, Def](Value *V) {
     bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
     if (VF.isScalar())
       return V;
-    // Place the code for broadcasting invariant variables in the new preheader.
+    // Place the code for broadcasting invariant variables in the new
+    // preheader.
     IRBuilder<>::InsertPointGuard Guard(Builder);
     if (SafeToHoist) {
       BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
@@ -272,56 +272,55 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
       Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
     }
 
-    // Place the code for broadcasting invariant variables in the new preheader.
-    // Broadcast the scalar into all locations in the vector.
+    // Place the code for broadcasting invariant variables in the new
+    // preheader. Broadcast the scalar into all locations in the vector.
     Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
     return Shuf;
   };
 
-  if (!hasScalarValue(Def, {Part, 0})) {
+  if (!hasScalarValue(Def, VPLane(0))) {
     assert(Def->isLiveIn() && "expected a live-in");
-    if (Part != 0)
-      return get(Def, 0);
     Value *IRV = Def->getLiveInIRValue();
     Value *B = GetBroadcastInstrs(IRV);
-    set(Def, B, Part);
+    set(Def, B);
     return B;
   }
 
-  Value *ScalarValue = get(Def, {Part, 0});
+  Value *ScalarValue = get(Def, VPLane(0));
   // If we aren't vectorizing, we can just copy the scalar map values over
   // to the vector map.
   if (VF.isScalar()) {
-    set(Def, ScalarValue, Part);
+    set(Def, ScalarValue);
     return ScalarValue;
   }
 
   bool IsUniform = vputils::isUniformAfterVectorization(Def);
-  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
+  VPLane LastLane = VPLane(IsUniform ? 0 : VF.getKnownMinValue() - 1);
   // Check if there is a scalar value for the selected lane.
-  if (!hasScalarValue(Def, {Part, LastLane})) {
-    // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
-    // VPExpandSCEVRecipes can also be uniform.
+  if (!hasScalarValue(Def, LastLane)) {
+    // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes
+    // and VPExpandSCEVRecipes can also be uniform.
assert((isa(Def->getDefiningRecipe()) || isa(Def->getDefiningRecipe()) || isa(Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); IsUniform = true; - LastLane = 0; + LastLane = VPLane(0); } - auto *LastInst = cast(get(Def, {Part, LastLane})); - // Set the insert point after the last scalarized instruction or after the - // last PHI, if LastInst is a PHI. This ensures the insertelement sequence - // will directly follow the scalar definitions. auto OldIP = Builder.saveIP(); - auto NewIP = - isa(LastInst) - ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) - : std::next(BasicBlock::iterator(LastInst)); - Builder.SetInsertPoint(&*NewIP); + if (auto *LastInst = dyn_cast(get(Def, LastLane))) { + // Set the insert point after the last scalarized instruction or after the + // last PHI, if LastInst is a PHI. This ensures the insertelement sequence + // will directly follow the scalar definitions. + auto NewIP = + isa(LastInst) + ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) + : std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + } // However, if we are vectorizing, we need to construct the vector values. // If the value is known to be uniform after vectorization, we can just @@ -332,15 +331,16 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) { Value *VectorValue = nullptr; if (IsUniform) { VectorValue = GetBroadcastInstrs(ScalarValue); - set(Def, VectorValue, Part); + set(Def, VectorValue); } else { // Initialize packing with insertelements to start from undef. assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); - set(Def, Undef, Part); + Value *Undef = + PoisonValue::get(VectorType::get(ScalarValue->getType(), VF)); + set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - packScalarIntoVectorValue(Def, {Part, Lane}); - VectorValue = get(Def, Part); + packScalarIntoVectorValue(Def, Lane); + VectorValue = get(Def); } Builder.restoreIP(OldIP); return VectorValue; @@ -392,12 +392,12 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { } void VPTransformState::packScalarIntoVectorValue(VPValue *Def, - const VPIteration &Instance) { - Value *ScalarInst = get(Def, Instance); - Value *VectorValue = get(Def, Instance.Part); - VectorValue = Builder.CreateInsertElement( - VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF)); - set(Def, VectorValue, Instance.Part); + const VPLane &Lane) { + Value *ScalarInst = get(Def, Lane); + Value *VectorValue = get(Def); + VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, + Lane.getAsRuntimeExpr(Builder, VF)); + set(Def, VectorValue); } BasicBlock * @@ -453,7 +453,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) { } void VPBasicBlock::execute(VPTransformState *State) { - bool Replica = State->Instance && !State->Instance->isFirstIteration(); + bool Replica = State->Lane && !State->Lane->isFirstLane(); VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB; VPBlockBase *SingleHPred = nullptr; BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. @@ -724,27 +724,24 @@ void VPRegionBlock::execute(VPTransformState *State) { return; } - assert(!State->Instance && "Replicating a Region with non-null instance."); + assert(!State->Lane && "Replicating a Region with non-null instance."); // Enter replicating mode. 
- State->Instance = VPIteration(0, 0); - - for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { - State->Instance->Part = Part; - assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; - ++Lane) { - State->Instance->Lane = VPLane(Lane, VPLane::Kind::First); - // Visit the VPBlocks connected to \p this, starting from it. - for (VPBlockBase *Block : RPOT) { - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); - Block->execute(State); - } + State->Lane = VPLane(0); + + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { + State->Lane = VPLane(Lane, VPLane::Kind::First); + // Visit the VPBlocks connected to \p this, starting from it. + for (VPBlockBase *Block : RPOT) { + LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); } } // Exit replicating mode. - State->Instance.reset(); + State->Lane.reset(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -816,10 +813,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, // FIXME: Model VF * UF computation completely in VPlan. VFxUF.setUnderlyingValue( createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF)); + if (VF.getNumUsers() > 0) { + VF.setUnderlyingValue( + createStepForVF(Builder, TripCountV->getType(), State.VF, 1)); + } // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. - // FIXME: Improve modeling for canonical IV start values in the epilogue loop. + // FIXME: Improve modeling for canonical IV start values in the epilogue + // loop. if (CanonicalIVStartValue) { VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue); auto *IV = getCanonicalIV(); @@ -871,12 +873,12 @@ void VPlan::execute(VPTransformState *State) { isa(&R)) { PHINode *Phi = nullptr; if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue(), 0)); + Phi = cast(State->get(R.getVPSingleValue())); } else { auto *WidenPhi = cast(&R); assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi, 0)); + auto *GEP = cast(State->get(WidenPhi)); Phi = cast(GEP->getPointerOperand()); } @@ -885,6 +887,9 @@ void VPlan::execute(VPTransformState *State) { // Move the last step to the end of the latch block. This ensures // consistent placement of all induction updates. Instruction *Inc = cast(Phi->getIncomingValue(1)); + if (isa(&R) && R.getNumOperands() == 4) + Inc->setOperand(0, State->get(R.getOperand(3))); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); continue; } @@ -894,24 +899,13 @@ void VPlan::execute(VPTransformState *State) { // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are // generated. - bool SinglePartNeeded = - isa(PhiR) || - isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isOrdered()); bool NeedsScalar = isa(PhiR) || (isa(PhiR) && cast(PhiR)->isInLoop()); - unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; - - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *Phi = State->get(PhiR, Part, NeedsScalar); - Value *Val = - State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? 
State->UF - 1 : Part, NeedsScalar); - cast(Phi)->addIncoming(Val, VectorLatchBB); - } + Value *Phi = State->get(PhiR, NeedsScalar); + Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); + cast(Phi)->addIncoming(Val, VectorLatchBB); } State->CFG.DTU.flush(); @@ -1249,6 +1243,10 @@ void VPlanIngredient::print(raw_ostream &O) const { template void DomTreeBuilder::Calculate(VPDominatorTree &DT); +bool VPValue::isDefinedOutsideVectorRegions() const { + return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent(); +} + void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5bb88e4a57dc3..84087295def21 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -164,6 +164,7 @@ class VPLane { public: VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} + VPLane(unsigned Lane) : Lane(Lane), LaneKind(Kind::First) {} static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } @@ -252,93 +253,76 @@ struct VPTransformState { /// Hold the indices to generate specific scalar instructions. Null indicates /// that all instances are to be generated, using either scalar or vector /// instructions. - std::optional Instance; + std::optional Lane; struct DataState { - /// A type for vectorized values in the new loop. Each value from the - /// original loop, when vectorized, is represented by UF vector values in - /// the new unrolled loop, where UF is the unroll factor. - typedef SmallVector PerPartValuesTy; + /// A type for vectorized values in the new loop. - DenseMap PerPartOutput; + DenseMap Output; - using ScalarsPerPartValuesTy = SmallVector, 2>; - DenseMap PerPartScalars; + using ScalarsPerPartValuesTy = SmallVector; + DenseMap Scalars; } Data; /// Get the generated vector Value for a given VPValue \p Def and a given \p /// Part if \p IsScalar is false, otherwise return the generated scalar /// for \p Part. \See set. - Value *get(VPValue *Def, unsigned Part, bool IsScalar = false); + Value *get(VPValue *Def, bool IsScalar = false); /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPIteration &Instance); + Value *get(VPValue *Def, const VPLane &Lane); - bool hasVectorValue(VPValue *Def, unsigned Part) { - auto I = Data.PerPartOutput.find(Def); - return I != Data.PerPartOutput.end() && Part < I->second.size() && - I->second[Part]; + bool hasVectorValue(VPValue *Def) { + auto I = Data.Output.find(Def); + return I != Data.Output.end() && I->second; } - bool hasScalarValue(VPValue *Def, VPIteration Instance) { - auto I = Data.PerPartScalars.find(Def); - if (I == Data.PerPartScalars.end()) + bool hasScalarValue(VPValue *Def, VPLane Lane) { + auto I = Data.Scalars.find(Def); + if (I == Data.Scalars.end()) return false; - unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); - return Instance.Part < I->second.size() && - CacheIdx < I->second[Instance.Part].size() && - I->second[Instance.Part][CacheIdx]; + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + return CacheIdx < I->second.size() && I->second[CacheIdx]; } /// Set the generated vector Value for a given VPValue and a given Part, if \p /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0). 
-  void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
+  void set(VPValue *Def, Value *V, bool IsScalar = false) {
     if (IsScalar) {
-      set(Def, V, VPIteration(Part, 0));
+      set(Def, V, VPLane(0));
       return;
     }
     assert((VF.isScalar() || V->getType()->isVectorTy()) &&
            "scalar values must be stored as (Part, 0)");
-    if (!Data.PerPartOutput.count(Def)) {
-      DataState::PerPartValuesTy Entry(UF);
-      Data.PerPartOutput[Def] = Entry;
-    }
-    Data.PerPartOutput[Def][Part] = V;
+    Data.Output[Def] = V;
   }
 
   /// Reset an existing vector value for \p Def and a given \p Part.
-  void reset(VPValue *Def, Value *V, unsigned Part) {
-    auto Iter = Data.PerPartOutput.find(Def);
-    assert(Iter != Data.PerPartOutput.end() &&
-           "need to overwrite existing value");
-    Iter->second[Part] = V;
+  void reset(VPValue *Def, Value *V) {
+    auto Iter = Data.Output.find(Def);
+    assert(Iter != Data.Output.end() && "need to overwrite existing value");
+    Iter->second = V;
   }
 
-  /// Set the generated scalar \p V for \p Def and the given \p Instance.
-  void set(VPValue *Def, Value *V, const VPIteration &Instance) {
-    auto Iter = Data.PerPartScalars.insert({Def, {}});
-    auto &PerPartVec = Iter.first->second;
-    if (PerPartVec.size() <= Instance.Part)
-      PerPartVec.resize(Instance.Part + 1);
-    auto &Scalars = PerPartVec[Instance.Part];
-    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+  /// Set the generated scalar \p V for \p Def and the given \p Lane.
+  void set(VPValue *Def, Value *V, const VPLane &Lane) {
+    auto Iter = Data.Scalars.insert({Def, {}});
+    auto &Scalars = Iter.first->second;
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
     if (Scalars.size() <= CacheIdx)
       Scalars.resize(CacheIdx + 1);
     assert(!Scalars[CacheIdx] && "should overwrite existing value");
     Scalars[CacheIdx] = V;
   }
 
-  /// Reset an existing scalar value for \p Def and a given \p Instance.
-  void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
-    auto Iter = Data.PerPartScalars.find(Def);
-    assert(Iter != Data.PerPartScalars.end() &&
-           "need to overwrite existing value");
-    assert(Instance.Part < Iter->second.size() &&
-           "need to overwrite existing value");
-    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
-    assert(CacheIdx < Iter->second[Instance.Part].size() &&
+  /// Reset an existing scalar value for \p Def and a given \p Lane.
+  void reset(VPValue *Def, Value *V, const VPLane &Lane) {
+    auto Iter = Data.Scalars.find(Def);
+    assert(Iter != Data.Scalars.end() && "need to overwrite existing value");
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    assert(CacheIdx < Iter->second.size() &&
           "need to overwrite existing value");
-    Iter->second[Instance.Part][CacheIdx] = V;
+    Iter->second[CacheIdx] = V;
  }
 
   /// Add additional metadata to \p To that was not present on \p Orig.
@@ -359,7 +343,7 @@ struct VPTransformState {
   void setDebugLocFrom(DebugLoc DL);
 
   /// Construct the vector value of a scalarized value \p V one lane at a time.
-  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance);
+  void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);
 
   /// Hold state information used when constructing the CFG of the output IR,
   /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
@@ -1224,12 +1208,12 @@ class VPInstruction : public VPRecipeWithIRFlags {
   /// modeled instruction for a given part. \returns the generated value for \p
   /// Part. In some cases an existing value is returned rather than a generated
   /// one.
- Value *generatePerPart(VPTransformState &State, unsigned Part); + Value *generate(VPTransformState &State); /// Utility methods serving execute(): generates a scalar single instance of /// the modeled instruction for a given lane. \returns the scalar generated /// value for lane \p Lane. - Value *generatePerLane(VPTransformState &State, const VPIteration &Lane); + Value *generatePerLane(VPTransformState &State, const VPLane &Lane); #if !defined(NDEBUG) /// Return true if the VPInstruction is a floating point math operation, i.e. @@ -1422,7 +1406,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe { Type *ResultTy; - Value *generate(VPTransformState &State, unsigned Part); + Value *generate(VPTransformState &State); public: VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) @@ -2604,6 +2588,9 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { return getStartValue()->getLiveInIRValue()->getType(); } + /// Returns the scalar type of the induction. + Type *getScalarType() { return getOperand(0)->getLiveInIRValue()->getType(); } + /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -2637,7 +2624,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { ~VPActiveLaneMaskPHIRecipe() override = default; VPActiveLaneMaskPHIRecipe *clone() override { - return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc()); + auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc()); + R->addOperand(getOperand(1)); + return R; } VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC) @@ -3144,6 +3133,7 @@ class VPlan { /// Represents the loop-invariant VF * UF of the vector loop region. VPValue VFxUF; + VPValue VF; /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. @@ -3232,6 +3222,7 @@ class VPlan { /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } + VPValue &getVF() { return VF; } void addVF(ElementCount VF) { VFs.insert(VF); } @@ -3665,6 +3656,29 @@ inline bool isUniformAfterVectorization(VPValue *VPV) { return VPI->isVectorToScalar(); return false; } + +/// Checks if \p C is uniform across all VFs and UFs. It is considered as such +/// if it is either defined outside the vector region or its operand is known to +/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 
+inline bool isUniformAcrossVFsAndUFs(VPValue *V) { + if (auto *VPI = dyn_cast_or_null(V->getDefiningRecipe())) { + return VPI == + VPI->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue(); + } + if (isa(V)) + return true; + if (isa(V) && cast(V)->isUniform() && + (isa(V->getUnderlyingValue())) && + all_of(V->getDefiningRecipe()->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); })) + return true; + + auto *C = dyn_cast_or_null(V->getDefiningRecipe()); + return C && (C->isDefinedOutsideVectorRegions() || + isa(C->getOperand(0)) || + isa(C->getOperand(0))); +} + } // end namespace vputils } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7a482455473e4..0afb2c5884c2b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -198,8 +198,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { assert(MiddleVPBB->getNumSuccessors() == 0 && "the middle block must not have any successors"); BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; - Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), - MiddleBB); + Phi->addIncoming(State.get(ExitValue, Lane), MiddleBB); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -311,7 +310,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { } Value *VPInstruction::generatePerLane(VPTransformState &State, - const VPIteration &Lane) { + const VPLane &Lane) { IRBuilderBase &Builder = State.Builder; assert(getOpcode() == VPInstruction::PtrAdd && @@ -320,13 +319,13 @@ Value *VPInstruction::generatePerLane(VPTransformState &State, State.get(getOperand(1), Lane), Name); } -Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { +Value *VPInstruction::generate(VPTransformState &State) { IRBuilderBase &Builder = State.Builder; if (Instruction::isBinaryOp(getOpcode())) { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); - Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); + Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); auto *Res = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); if (auto *I = dyn_cast(Res)) @@ -336,25 +335,25 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { switch (getOpcode()) { case VPInstruction::Not: { - Value *A = State.get(getOperand(0), Part); + Value *A = State.get(getOperand(0)); return Builder.CreateNot(A, Name); } case Instruction::ICmp: { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *A = State.get(getOperand(0)); + Value *B = State.get(getOperand(1)); return Builder.CreateCmp(getPredicate(), A, B, Name); } case Instruction::Select: { - Value *Cond = State.get(getOperand(0), Part); - Value *Op1 = State.get(getOperand(1), Part); - Value *Op2 = State.get(getOperand(2), Part); + Value *Cond = State.get(getOperand(0)); + Value *Op1 = State.get(getOperand(1)); + Value *Op2 = State.get(getOperand(2)); return Builder.CreateSelect(Cond, Op1, Op2, Name); } case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); + Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); // Get the original loop tripcount. 
- Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); + Value *ScalarTC = State.get(getOperand(1), VPLane(0)); // If this part of the active lane mask is scalar, generate the CMP directly // to avoid unnecessary extracts. @@ -382,18 +381,14 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { // v3 = vector(v1(3), v2(0, 1, 2)) // For the first part, use the recurrence phi (v1), otherwise v2. - auto *V1 = State.get(getOperand(0), 0); - Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); + Value *PartMinus1 = State.get(getOperand(0)); if (!PartMinus1->getType()->isVectorTy()) return PartMinus1; - Value *V2 = State.get(getOperand(1), Part); + Value *V2 = State.get(getOperand(1)); return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); } case VPInstruction::CalculateTripCountMinusVF: { - if (Part != 0) - return State.get(this, 0, /*IsScalar*/ true); - - Value *ScalarTC = State.get(getOperand(0), {0, 0}); + Value *ScalarTC = State.get(getOperand(0), VPLane(0)); Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); Value *Sub = Builder.CreateSub(ScalarTC, Step); @@ -418,30 +413,28 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { }; // TODO: Restructure this code with an explicit remainder loop, vsetvli can // be outside of the main loop. - assert(Part == 0 && "No unrolling expected for predicated vectorization."); // Compute VTC - IV as the AVL (requested vector length). - Value *Index = State.get(getOperand(0), VPIteration(0, 0)); - Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); + Value *Index = State.get(getOperand(0), VPLane(0)); + Value *TripCount = State.get(getOperand(1), VPLane(0)); Value *AVL = State.Builder.CreateSub(TripCount, Index); Value *EVL = GetEVL(State, AVL); return EVL; } case VPInstruction::CanonicalIVIncrementForPart: { - auto *IV = State.get(getOperand(0), VPIteration(0, 0)); - if (Part == 0) - return IV; - - // The canonical IV is incremented by the vectorization factor (num of SIMD - // elements) times the unroll part. - Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), - hasNoSignedWrap()); + auto *IV = State.get(getOperand(0), VPLane(0)); + if (getNumOperands() == 2) { + // The canonical IV is incremented by the vectorization factor (num of + // SIMD elements) times the unroll part. + Value *Step = createStepForVF( + Builder, IV->getType(), State.VF, + cast(getOperand(1)->getLiveInIRValue())->getZExtValue()); + return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), + hasNoSignedWrap()); + } + return IV; } case VPInstruction::BranchOnCond: { - if (Part != 0) - return nullptr; - - Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); + Value *Cond = State.get(getOperand(0), VPLane(0)); VPRegionBlock *ParentRegion = getParent()->getParent(); VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); @@ -459,11 +452,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { return CondBr; } case VPInstruction::BranchOnCount: { - if (Part != 0) - return nullptr; // First create the compare. - Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); - Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); + Value *IV = State.get(getOperand(0), /*IsScalar*/ true); + Value *TC = State.get(getOperand(1), /*IsScalar*/ true); Value *Cond = Builder.CreateICmpEQ(IV, TC); // Now create the branch. 
@@ -483,8 +474,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { return CondBr; } case VPInstruction::ComputeReductionResult: { - if (Part != 0) - return State.get(this, 0, /*IsScalar*/ true); + unsigned NumParts = getNumOperands() - 1; // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary // and will be removed by breaking up the recipe further. @@ -495,11 +485,10 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { RecurKind RK = RdxDesc.getRecurrenceKind(); - VPValue *LoopExitingDef = getOperand(1); Type *PhiTy = OrigPhi->getType(); - VectorParts RdxParts(State.UF); - for (unsigned Part = 0; Part < State.UF; ++Part) - RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); + VectorParts RdxParts(NumParts); + for (unsigned Part = 0; Part != NumParts; ++Part) + RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the @@ -507,7 +496,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { // TODO: Handle this in truncateToMinBW. if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); - for (unsigned Part = 0; Part < State.UF; ++Part) + for (unsigned Part = 0; Part < NumParts; ++Part) RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -517,12 +506,12 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { Op = Instruction::Or; if (PhiR->isOrdered()) { - ReducedPartRdx = RdxParts[State.UF - 1]; + ReducedPartRdx = RdxParts[NumParts - 1]; } else { // Floating-point operations should have some FMF to enable the reduction. IRBuilderBase::FastMathFlagGuard FMFG(Builder); Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 1; Part < State.UF; ++Part) { + for (unsigned Part = 1; Part < NumParts; ++Part) { Value *RdxPart = RdxParts[Part]; if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( @@ -558,38 +547,28 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { return ReducedPartRdx; } case VPInstruction::ExtractFromEnd: { - if (Part != 0) - return State.get(this, 0, /*IsScalar*/ true); - auto *CI = cast(getOperand(1)->getLiveInIRValue()); unsigned Offset = CI->getZExtValue(); assert(Offset > 0 && "Offset from end must be positive"); Value *Res; - if (State.VF.isVector()) { - assert(Offset <= State.VF.getKnownMinValue() && - "invalid offset to extract from"); - // Extract lane VF - Offset from the operand. - Res = State.get( - getOperand(0), - VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset))); - } else { - assert(Offset <= State.UF && "invalid offset to extract from"); - // When loop is unrolled without vectorizing, retrieve UF - Offset. - Res = State.get(getOperand(0), State.UF - Offset); - } + assert(State.VF.isVector()); + assert(Offset <= State.VF.getKnownMinValue() && + "invalid offset to extract from"); + // Extract lane VF - Offset from the operand. 
+ Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); Res->setName(Name); return Res; } case VPInstruction::LogicalAnd: { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *A = State.get(getOperand(0)); + Value *B = State.get(getOperand(1)); return Builder.CreateLogicalAnd(A, B, Name); } case VPInstruction::PtrAdd: { assert(vputils::onlyFirstLaneUsed(this) && "can only generate first lane for PtrAdd"); - Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true); - Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true); + Value *Ptr = State.get(getOperand(0), /* IsScalar */ true); + Value *Addend = State.get(getOperand(1), /* IsScalar */ true); return Builder.CreatePtrAdd(Ptr, Addend, Name); } default: @@ -614,7 +593,7 @@ bool VPInstruction::isFPMathOp() const { #endif void VPInstruction::execute(VPTransformState &State) { - assert(!State.Instance && "VPInstruction executing an Instance"); + assert(!State.Lane && "VPInstruction executing an Lane"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); assert((hasFastMathFlags() == isFPMathOp() || getOpcode() == Instruction::Select) && @@ -627,35 +606,26 @@ void VPInstruction::execute(VPTransformState &State) { (vputils::onlyFirstLaneUsed(this) || isVectorToScalar()); bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this); - for (unsigned Part = 0; Part < State.UF; ++Part) { if (GeneratesPerAllLanes) { for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); Lane != NumLanes; ++Lane) { - Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane)); + Value *GeneratedValue = generatePerLane(State, Lane); assert(GeneratedValue && "generatePerLane must produce a value"); - State.set(this, GeneratedValue, VPIteration(Part, Lane)); + State.set(this, GeneratedValue, VPLane(Lane)); } - continue; - } - - if (Part != 0 && OnlyFirstPartUsed && hasResult()) { - Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly); - State.set(this, Part0, Part, - /*IsScalar*/ GeneratesPerFirstLaneOnly); - continue; + return; } - Value *GeneratedValue = generatePerPart(State, Part); + Value *GeneratedValue = generate(State); if (!hasResult()) - continue; - assert(GeneratedValue && "generatePerPart must produce a value"); + return; + assert(GeneratedValue && "generate must produce a value"); assert((GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly || State.VF.isScalar()) && "scalar value but not only first lane defined"); - State.set(this, GeneratedValue, Part, + State.set(this, GeneratedValue, /*IsScalar*/ GeneratesPerFirstLaneOnly); - } } bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { @@ -782,61 +752,59 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { FunctionType *VFTy = nullptr; if (Variant) VFTy = Variant->getFunctionType(); - for (unsigned Part = 0; Part < State.UF; ++Part) { - SmallVector TysForDecl; - // Add return type if intrinsic is overloaded on it. + SmallVector TysForDecl; + // Add return type if intrinsic is overloaded on it. + if (UseIntrinsic && + isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) + TysForDecl.push_back(VectorType::get( + CalledScalarFn->getReturnType()->getScalarType(), State.VF)); + SmallVector Args; + for (const auto &I : enumerate(arg_operands())) { + // Some intrinsics have a scalar argument - don't replace it with a + // vector. 
+ Value *Arg; if (UseIntrinsic && - isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) - TysForDecl.push_back(VectorType::get( - CalledScalarFn->getReturnType()->getScalarType(), State.VF)); - SmallVector Args; - for (const auto &I : enumerate(arg_operands())) { - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - Value *Arg; - if (UseIntrinsic && - isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) - Arg = State.get(I.value(), VPIteration(0, 0)); - // Some vectorized function variants may also take a scalar argument, - // e.g. linear parameters for pointers. This needs to be the scalar value - // from the start of the respective part when interleaving. - else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) - Arg = State.get(I.value(), VPIteration(Part, 0)); - else - Arg = State.get(I.value(), Part); - if (UseIntrinsic && - isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) - TysForDecl.push_back(Arg->getType()); - Args.push_back(Arg); - } + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) + Arg = State.get(I.value(), VPLane(0)); + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. This needs to be the scalar value + // from the start of the respective part when interleaving. + else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) + Arg = State.get(I.value(), VPLane(0)); + else + Arg = State.get(I.value()); + if (UseIntrinsic && + isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) + TysForDecl.push_back(Arg->getType()); + Args.push_back(Arg); + } - Function *VectorF; - if (UseIntrinsic) { - // Use vector version of the intrinsic. - Module *M = State.Builder.GetInsertBlock()->getModule(); - VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); - assert(VectorF && "Can't retrieve vector intrinsic."); - } else { + Function *VectorF; + if (UseIntrinsic) { + // Use vector version of the intrinsic. + Module *M = State.Builder.GetInsertBlock()->getModule(); + VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); + assert(VectorF && "Can't retrieve vector intrinsic."); + } else { #ifndef NDEBUG - assert(Variant != nullptr && "Can't create vector function."); + assert(Variant != nullptr && "Can't create vector function."); #endif - VectorF = Variant; - } + VectorF = Variant; + } - auto *CI = cast_or_null(getUnderlyingInstr()); - SmallVector OpBundles; - if (CI) - CI->getOperandBundlesAsDefs(OpBundles); + auto *CI = cast_or_null(getUnderlyingInstr()); + SmallVector OpBundles; + if (CI) + CI->getOperandBundlesAsDefs(OpBundles); - CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); + CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); - if (isa(V)) - V->copyFastMathFlags(CI); + if (isa(V)) + V->copyFastMathFlags(CI); - if (!V->getType()->isVoidTy()) - State.set(this, V, Part); - State.addMetadata(V, CI); - } + if (!V->getType()->isVoidTy()) + State.set(this, V); + State.addMetadata(V, CI); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -890,16 +858,14 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. auto *InvarCond = - isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr; - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Cond = InvarCond ? 
InvarCond : State.get(getCond(), Part); - Value *Op0 = State.get(getOperand(1), Part); - Value *Op1 = State.get(getOperand(2), Part); - Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); - State.set(this, Sel, Part); - State.addMetadata(Sel, dyn_cast_or_null(getUnderlyingValue())); - } + isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; + + Value *Cond = InvarCond ? InvarCond : State.get(getCond()); + Value *Op0 = State.get(getOperand(1)); + Value *Op1 = State.get(getOperand(2)); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, 0); + State.addMetadata(Sel, dyn_cast_or_null(getUnderlyingValue())); } VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( @@ -982,53 +948,47 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - for (unsigned Part = 0; Part < State.UF; ++Part) { - SmallVector Ops; - for (VPValue *VPOp : operands()) - Ops.push_back(State.get(VPOp, Part)); - Value *V = Builder.CreateNAryOp(Opcode, Ops); + SmallVector Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp)); - if (auto *VecOp = dyn_cast(V)) - setFlags(VecOp); + Value *V = Builder.CreateNAryOp(Opcode, Ops); - // Use this vector value for all users of the original instruction. - State.set(this, V, Part); - State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); - } + if (auto *VecOp = dyn_cast(V)) + setFlags(VecOp); + + // Use this vector value for all users of the original instruction. + State.set(this, V, 0); + State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); break; } case Instruction::Freeze: { - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Op = State.get(getOperand(0), Part); + Value *Op = State.get(getOperand(0)); - Value *Freeze = Builder.CreateFreeze(Op); - State.set(this, Freeze, Part); - } + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, 0); break; } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = Opcode == Instruction::FCmp; - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (auto *I = dyn_cast_or_null(getUnderlyingValue())) - Builder.setFastMathFlags(I->getFastMathFlags()); - C = Builder.CreateFCmp(getPredicate(), A, B); - } else { - C = Builder.CreateICmp(getPredicate(), A, B); - } - State.set(this, C, Part); - State.addMetadata(C, dyn_cast_or_null(getUnderlyingValue())); + Value *A = State.get(getOperand(0)); + Value *B = State.get(getOperand(1)); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (auto *I = dyn_cast_or_null(getUnderlyingValue())) + Builder.setFastMathFlags(I->getFastMathFlags()); + C = Builder.CreateFCmp(getPredicate(), A, B); + } else { + C = Builder.CreateICmp(getPredicate(), A, B); } - + State.set(this, C, 0); + State.addMetadata(C, dyn_cast_or_null(getUnderlyingValue())); break; } default: @@ -1041,11 +1001,9 @@ void VPWidenRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) // Verify that VPlan type inference results agree with the type of the // generated values. 
- for (unsigned Part = 0; Part < State.UF; ++Part) { - assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), - State.VF) == State.get(this, Part)->getType() && - "inferred type and type from generated instructions do not match"); - } + assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == + State.get(this)->getType() && + "inferred type and type from generated instructions do not match"); #endif } @@ -1067,17 +1025,10 @@ void VPWidenCastRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "Not vectorizing?"); Type *DestTy = VectorType::get(getResultType(), State.VF); VPValue *Op = getOperand(0); - for (unsigned Part = 0; Part < State.UF; ++Part) { - if (Part > 0 && Op->isLiveIn()) { - // FIXME: Remove once explicit unrolling is implemented using VPlan. - State.set(this, State.get(this, 0), Part); - continue; - } - Value *A = State.get(Op, Part); - Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); - State.set(this, Cast, Part); - State.addMetadata(Cast, cast_or_null(getUnderlyingValue())); - } + Value *A = State.get(Op); + Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); + State.set(this, Cast, 0); + State.addMetadata(Cast, cast_or_null(getUnderlyingValue())); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1161,7 +1112,7 @@ static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Int or FP induction being replicated."); + assert(!State.Lane && "Int or FP induction being replicated."); Value *Start = getStartValue()->getLiveInIRValue(); const InductionDescriptor &ID = getInductionDescriptor(); @@ -1180,7 +1131,7 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); // Now do the actual transformations, and start with fetching the step value. - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + Value *Step = State.get(getStepValue(), VPLane(0)); assert((isa(EntryVal) || isa(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -1201,8 +1152,7 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector( SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction + // We create vector phi nodes for both integer and floating-point induction{ // variables. Here, we determine the kind of arithmetic we will perform. Instruction::BinaryOps AddOp; Instruction::BinaryOps MulOp; @@ -1214,24 +1164,30 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. 
- Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + Value *SplatVF; + if (getNumOperands() == 4) + // Need to create stuff in PH. + SplatVF = State.get(getOperand(2)); + else { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ? ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1239,17 +1195,16 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; - for (unsigned Part = 0; Part < State.UF; ++Part) { - State.set(this, LastInduction, Part); + State.set(this, LastInduction, 0); - if (isa(EntryVal)) - State.addMetadata(LastInduction, EntryVal); + if (isa(EntryVal)) + State.addMetadata(LastInduction, EntryVal); - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); - } + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, VectorPH); @@ -1314,8 +1269,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step. - Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + Value *BaseIV = State.get(getOperand(0), VPLane(0)); + Value *Step = State.get(getStepValue(), VPLane(0)); IRBuilderBase &Builder = State.Builder; // Ensure step has the same type as that of scalar IV. @@ -1350,47 +1305,45 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); } - unsigned StartPart = 0; - unsigned EndPart = State.UF; unsigned StartLane = 0; unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); - if (State.Instance) { - StartPart = State.Instance->Part; - EndPart = StartPart + 1; - StartLane = State.Instance->Lane.getKnownLane(); + unsigned Part = getNumOperands() == 2 + ? 
0 + : cast(getOperand(2)->getLiveInIRValue()) + ->getZExtValue(); + if (State.Lane) { + StartLane = State.Lane->getKnownLane(); EndLane = StartLane + 1; } - for (unsigned Part = StartPart; Part < EndPart; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - - if (!FirstLaneOnly && State.VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); - auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); - if (BaseIVTy->isFloatingPointTy()) - InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); - auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); - auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); - State.set(this, Add, Part); - // It's useful to record the lane values too for the known minimum number - // of elements so we do those below. This improves the code quality when - // trying to extract the first element, for example. - } + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); + if (!FirstLaneOnly && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); + auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (BaseIVTy->isFloatingPointTy()) - StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); - - for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { - Value *StartIdx = Builder.CreateBinOp( - AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((State.VF.isScalable() || isa(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); - auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); - State.set(this, Add, VPIteration(Part, Lane)); - } + InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); + auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); + auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); + State.set(this, Add); + // It's useful to record the lane values too for the known minimum number + // of elements so we do those below. This improves the code quality when + // trying to extract the first element, for example. + } + + if (BaseIVTy->isFloatingPointTy()) + StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); + + for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { + Value *StartIdx = Builder.CreateBinOp( + AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((State.VF.isScalable() || isa(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); + auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); + State.set(this, Add, VPLane(Lane)); } } @@ -1428,16 +1381,14 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // the lane-zero scalar value. 
SmallVector Ops; for (unsigned I = 0, E = getNumOperands(); I != E; I++) - Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); + Ops.push_back(State.get(getOperand(I), VPLane(0))); auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], ArrayRef(Ops).drop_front(), "", isInBounds()); - for (unsigned Part = 0; Part < State.UF; ++Part) { Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP); - State.set(this, EntryPart, Part); + State.set(this, EntryPart, 0); State.addMetadata(EntryPart, GEP); - } } else { // If the GEP has at least one loop-varying operand, we are sure to // produce a vector of pointers. But if we are only unrolling, we want @@ -1446,23 +1397,21 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // (otherwise). Note that for the unroll-only case, we still maintain // values in the vector mapping with initVector, as we do for other // instructions. - for (unsigned Part = 0; Part < State.UF; ++Part) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = isPointerLoopInvariant() - ? State.get(getOperand(0), VPIteration(0, 0)) - : State.get(getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - if (isIndexLoopInvariant(I - 1)) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } + auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) + : State.get(getOperand(0)); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (isIndexLoopInvariant(I - 1)) + Indices.push_back(State.get(Operand, VPLane(0))); + else + Indices.push_back(State.get(Operand)); + } // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. @@ -1470,9 +1419,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { Indices, "", isInBounds()); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - State.set(this, NewGEP, Part); + State.set(this, NewGEP, 0); State.addMetadata(NewGEP, GEP); - } } } @@ -1495,39 +1443,40 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe ::execute(VPTransformState &State) { auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = nullptr; - // Use i32 for the gep index type when the value is constant, - // or query DataLayout for a more suitable index type otherwise. - const DataLayout &DL = - Builder.GetInsertBlock()->getModule()->getDataLayout(); - Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) - ? DL.getIndexType(IndexedTy->getPointerTo()) - : Builder.getInt32Ty(); - Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); - bool InBounds = isInBounds(); - if (IsReverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. 
- // RunTimeVF = VScale * VF.getKnownMinValue() - // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); - // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul( - ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = - Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); - PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); - PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); - } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); - PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); - } - - State.set(this, PartPtr, Part, /*IsScalar*/ true); + unsigned Part = getNumOperands() == 1 + ? 0 + : cast(getOperand(1)->getLiveInIRValue()) + ->getZExtValue(); + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = nullptr; + // Use i32 for the gep index type when the value is constant, + // or query DataLayout for a more suitable index type otherwise. + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); + Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) + ? DL.getIndexType(IndexedTy->getPointerTo()) + : Builder.getInt32Ty(); + Value *Ptr = State.get(getOperand(0), VPLane(0)); + bool InBounds = isInBounds(); + if (IsReverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = + Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = + Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); + PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); + PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); + } else { + Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); } + + State.set(this, PartPtr, /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1561,26 +1510,23 @@ void VPBlendRecipe::execute(VPTransformState &State) { // In0))) // Note that Mask0 is never used: lanes for which no path reaches this phi and // are essentially undef are taken from In0. - VectorParts Entry(State.UF); - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - for (unsigned In = 0; In < NumIncoming; ++In) { - for (unsigned Part = 0; Part < State.UF; ++Part) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed); - if (In == 0) - Entry[Part] = In0; // Initialize with the first incoming value. - else { - // Select between the current value and the previous incoming edge - // based on the incoming mask. 
- Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed); - Entry[Part] = - State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); - } - } - } - for (unsigned Part = 0; Part < State.UF; ++Part) - State.set(this, Entry[Part], Part, OnlyFirstLaneUsed); + Value *Res = nullptr; + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); + for (unsigned In = 0; In < NumIncoming; ++In) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); + if (In == 0) + Res = In0; // Initialize with the first incoming value. + else { + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); + Res = State.Builder.CreateSelect(Cond, In0, Res, "predphi"); + } + } + + State.set(this, Res, OnlyFirstLaneUsed); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1608,27 +1554,26 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPReductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Reduction being replicated."); - Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); + assert(!State.Lane && "Reduction being replicated."); + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); RecurKind Kind = RdxDesc.getRecurrenceKind(); // Propagate the fast-math flags carried by the underlying instruction. IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *NewVecOp = State.get(getVecOp(), Part); - if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); - VectorType *VecTy = dyn_cast(NewVecOp->getType()); - Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); - Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, - RdxDesc.getFastMathFlags()); - if (State.VF.isVector()) { - Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); - } - - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); - NewVecOp = Select; + Value *NewVecOp = State.get(getVecOp()); + if (VPValue *Cond = getCondOp()) { + Value *NewCond = State.get(Cond, State.VF.isScalar()); + VectorType *VecTy = dyn_cast(NewVecOp->getType()); + Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); + Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, + RdxDesc.getFastMathFlags()); + if (State.VF.isVector()) { + Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); } + + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); + NewVecOp = Select; + } Value *NewRed; Value *NextInChain; if (IsOrdered) { @@ -1641,7 +1586,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { NewVecOp); PrevInChain = NewRed; } else { - PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); + PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { @@ -1652,8 +1597,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { else NextInChain = State.Builder.CreateBinOp( (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); - State.set(this, NextInChain, Part, /*IsScalar*/ true); - } + State.set(this, NextInChain, /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1720,18 +1664,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -/// Checks if \p C is uniform across all VFs and UFs. It is considered as such -/// if it is either defined outside the vector region or its operand is known to -/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). -/// TODO: Uniformity should be associated with a VPValue and there should be a -/// generic way to check. -static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { - return C->isDefinedOutsideVectorRegions() || - isa(C->getOperand(0)) || - isa(C->getOperand(0)); -} - -Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { +Value *VPScalarCastRecipe ::generate(VPTransformState &State) { assert(vputils::onlyFirstLaneUsed(this) && "Codegen only implemented for first lane."); switch (Opcode) { @@ -1739,7 +1672,7 @@ Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { case Instruction::ZExt: case Instruction::Trunc: { // Note: SExt/ZExt not used yet. - Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); + Value *Op = State.get(getOperand(0), /*IsScalar*/ true); return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); } default: @@ -1748,17 +1681,7 @@ Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { } void VPScalarCastRecipe ::execute(VPTransformState &State) { - bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); - for (unsigned Part = 0; Part != State.UF; ++Part) { - Value *Res; - // Only generate a single instance, if the recipe is uniform across UFs and - // VFs. 
- if (Part > 0 && IsUniformAcrossVFsAndUFs) - Res = State.get(this, VPIteration(0, 0)); - else - Res = generate(State, Part); - State.set(this, Res, VPIteration(Part, 0)); - } + State.set(this, generate(State), /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1773,15 +1696,14 @@ void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, #endif void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Branch on Mask works only on single instance."); + assert(State.Lane && "Branch on Mask works only on single instance."); - unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane.getKnownLane(); + unsigned Lane = State.Lane->getKnownLane(); Value *ConditionBit = nullptr; VPValue *BlockInMask = getMask(); if (BlockInMask) { - ConditionBit = State.get(BlockInMask, Part); + ConditionBit = State.get(BlockInMask); if (ConditionBit->getType()->isVectorTy()) ConditionBit = State.Builder.CreateExtractElement( ConditionBit, State.Builder.getInt32(Lane)); @@ -1799,9 +1721,9 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { } void VPPredInstPHIRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Predicated instruction PHI works per instance."); + assert(State.Lane && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = - cast(State.get(getOperand(0), *State.Instance)); + cast(State.get(getOperand(0), *State.Lane)); BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); @@ -1814,33 +1736,32 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // needed. In this case the recipe of the predicated instruction is marked to // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. - unsigned Part = State.Instance->Part; - if (State.hasVectorValue(getOperand(0), Part)) { - Value *VectorValue = State.get(getOperand(0), Part); + if (State.hasVectorValue(getOperand(0))) { + Value *VectorValue = State.get(getOperand(0)); InsertElementInst *IEI = cast(VectorValue); PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. - if (State.hasVectorValue(this, Part)) - State.reset(this, VPhi, Part); + if (State.hasVectorValue(this)) + State.reset(this, VPhi); else - State.set(this, VPhi, Part); + State.set(this, VPhi); // NOTE: Currently we need to update the value of the operand, so the next // predicated iteration inserts its generated value in the correct vector. - State.reset(getOperand(0), VPhi, Part); + State.reset(getOperand(0), VPhi); } else { Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); - if (State.hasScalarValue(this, *State.Instance)) - State.reset(this, Phi, *State.Instance); + if (State.hasScalarValue(this, *State.Lane)) + State.reset(this, Phi, *State.Lane); else - State.set(this, Phi, *State.Instance); + State.set(this, Phi, *State.Lane); // NOTE: Currently we need to update the value of the operand, so the next // predicated iteration inserts its generated value in the correct vector. 
- State.reset(getOperand(0), Phi, *State.Instance); + State.reset(getOperand(0), Phi, *State.Lane); } } @@ -1890,8 +1811,7 @@ void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); EntryPart->addIncoming(Start, VectorPH); EntryPart->setDebugLoc(getDebugLoc()); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part, /*IsScalar*/ true); + State.set(this, EntryPart, /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1935,11 +1855,17 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, O << " = WIDEN-POINTER-INDUCTION "; getStartValue()->printAsOperand(O, SlotTracker); O << ", " << *IndDesc.getStep(); + if (getNumOperands() == 4) { + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(3)->printAsOperand(O, SlotTracker); + } } #endif void VPExpandSCEVRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "cannot be used in per-lane"); + assert(!State.Lane && "cannot be used in per-lane"); const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); SCEVExpander Exp(SE, DL, "induction"); @@ -1948,8 +1874,7 @@ void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.ExpandedSCEVs.contains(Expr) && "Same SCEV expanded multiple times"); State.ExpandedSCEVs[Expr] = Res; - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, Res, {Part, 0}); + State.set(this, Res, VPLane(0)); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1962,23 +1887,26 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); + Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; Value *VStart = VF.isScalar() ? CanonicalIV : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *VStep = createStepForVF(Builder, STy, VF, Part); - if (VF.isVector()) { - VStep = Builder.CreateVectorSplat(VF, VStep); - VStep = - Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); - } - Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); - State.set(this, CanonicalVectorIV, Part); + unsigned Part = getNumOperands() == 1 + ? 0 + : cast(getOperand(1)->getLiveInIRValue()) + ->getZExtValue(); + + Value *VStep = createStepForVF(Builder, STy, VF, Part); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = + Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); } + Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(this, CanonicalVectorIV); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2048,17 +1976,19 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentVectorLoop->getHeader() == HeaderBB && "recipe must be in the vector loop header"); - unsigned LastPartForNewPhi = isOrdered() ? 
1 : State.UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); - EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, EntryPart, Part, IsInLoop); - } + Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); + EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); + State.set(this, EntryPart, IsInLoop); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); + unsigned Part = getNumOperands() == 2 + ? 0 + : cast(getOperand(2)->getLiveInIRValue()) + ->getZExtValue(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { // MinMax and AnyOf reductions have the start value as their identity. @@ -2075,21 +2005,22 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { RdxDesc.getFastMathFlags()); if (!ScalarPHI) { - Iden = Builder.CreateVectorSplat(State.VF, Iden); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(VectorPH->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + if (Part != 0) { + Iden = Builder.CreateVectorSplat(State.VF, Iden); + } else { + Iden = Builder.CreateVectorSplat(State.VF, Iden); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } } } - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = State.get(this, Part, IsInLoop); - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? StartV : Iden; - cast(EntryPart)->addIncoming(StartVal, VectorPH); - } + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast(EntryPart)->addIncoming(StartVal, VectorPH); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2107,7 +2038,7 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); - Value *Op0 = State.get(getOperand(0), 0); + Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); State.set(this, VecPhi, 0); @@ -2138,14 +2069,12 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, // remove VPActiveLaneMaskPHIRecipe. 
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *StartMask = State.get(getOperand(0), Part); - PHINode *EntryPart = - State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); - EntryPart->addIncoming(StartMask, VectorPH); - EntryPart->setDebugLoc(getDebugLoc()); - State.set(this, EntryPart, Part); - } + Value *StartMask = State.get(getOperand(0)); + PHINode *EntryPart = + State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(getDebugLoc()); + State.set(this, EntryPart); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2162,12 +2091,12 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization."); - Value *Start = State.get(getOperand(0), VPIteration(0, 0)); + Value *Start = State.get(getOperand(0), VPLane(0)); PHINode *EntryPart = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv"); EntryPart->addIncoming(Start, VectorPH); EntryPart->setDebugLoc(getDebugLoc()); - State.set(this, EntryPart, 0, /*IsScalar=*/true); + State.set(this, EntryPart, /*IsScalar=*/true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8ec67eb2f54bd..9c23377d6a451 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1572,3 +1572,396 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } } } + +static VPValue *getInterleavedValue( + DenseMap> &InterleavedValues, VPValue *V, + unsigned IC) { + + if (V->isDefinedOutsideVectorRegions()) + return V; + if (IC == 0) + return V; + if (V->isLiveIn()) + return V; + return InterleavedValues[V][IC - 1]; +} + +static void interleaveReplicateRegion( + VPRegionBlock *VPR, VPlan &Plan, unsigned IC, + DenseMap> &InterleavedValues) { + Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); + VPBlockBase *InsertPt = VPR; + for (unsigned I = 1; I != IC; ++I) { + auto *Copy = VPR->clone(); + VPBlockUtils::insertBlockAfter(Copy, InsertPt); + InsertPt = Copy; + + ReversePostOrderTraversal> + RPOT(Copy->getEntry()); + ReversePostOrderTraversal> + RPOT2(VPR->getEntry()); + for (const auto &[New, Old] : + zip(VPBlockUtils::blocksOnly(RPOT), + VPBlockUtils::blocksOnly(RPOT2))) { + if (New->getParent() != Copy) + break; + for (const auto &[CopyR, OrigR] : zip(*New, *Old)) { + for (unsigned Idx = 0; Idx != CopyR.getNumOperands(); ++Idx) { + CopyR.setOperand(Idx, getInterleavedValue(InterleavedValues, + CopyR.getOperand(Idx), I)); + } + if (auto *ScalarIVSteps = dyn_cast(&CopyR)) { + ScalarIVSteps->addOperand( + Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I))); + } + + unsigned Idx = 0; + for (VPValue *Res : OrigR.definedValues()) { + auto Ins = InterleavedValues.insert({Res, {}}); + Ins.first->second.push_back(CopyR.getVPValue(Idx)); + Idx++; + } + } + } + } +} + +static void interleaveHeaderPHI( + VPRecipeBase &R, VPlan &Plan, unsigned IC, + VPBasicBlock::iterator &InsertPtForPhi, + DenseMap> &InterleavedValues, + VPTypeAnalysis &TypeInfo, SmallPtrSet &ToSkip, + SmallVector> &PhisToRemap) { + if (isa(&R)) + return; + + // 
Generate step vectors for each unrolled part. + if (auto *IV = dyn_cast(&R)) { + VPBasicBlock *PH = + cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); + VPValue *Step = &Plan.getVF(); + Type *IVTy = TypeInfo.inferScalarType(IV); + auto &ID = IV->getInductionDescriptor(); + FastMathFlags FMFs; + if (ID.getInductionBinOp() && isa(ID.getInductionBinOp())) + FMFs = ID.getInductionBinOp()->getFastMathFlags(); + + if (TypeInfo.inferScalarType(Step) != IVTy) { + Instruction::CastOps CastOp; + if (IVTy->isFloatingPointTy()) + CastOp = Instruction::UIToFP; + else + CastOp = Instruction::Trunc; + Step = new VPWidenCastRecipe(CastOp, Step, IV->getScalarType()); + PH->appendRecipe(Step->getDefiningRecipe()); + ToSkip.insert(Step->getDefiningRecipe()); + } + + auto *ConstScale = + IV->getOperand(1)->isLiveIn() + ? dyn_cast(IV->getOperand(1)->getLiveInIRValue()) + : nullptr; + if (!ConstScale || ConstScale->getZExtValue() != 1) { + VPValue *Scale = IV->getOperand(1); + if (TypeInfo.inferScalarType(Scale) != IVTy) { + Scale = new VPWidenCastRecipe(Instruction::Trunc, Scale, + IV->getScalarType()); + PH->appendRecipe(Scale->getDefiningRecipe()); + ToSkip.insert(Scale->getDefiningRecipe()); + } + + VPBuilder Builder(PH); + VPInstruction *Mul; + if (IVTy->isFloatingPointTy()) + Mul = Builder.createFPOp(Instruction::FMul, {Step, Scale}, + R.getDebugLoc(), "", FMFs); + else + Mul = Builder.createNaryOp(Instruction::Mul, {Step, Scale}, + R.getDebugLoc()); + Step = Mul; + ToSkip.insert(Mul); + } + R.addOperand(Step); + + for (unsigned I = 1; I != IC; ++I) { + VPBuilder Builder; + Builder.setInsertPoint(R.getParent(), InsertPtForPhi); + auto Ins = InterleavedValues.insert({IV, {}}); + VPValue *Prev = getInterleavedValue(InterleavedValues, IV, I - 1); + VPInstruction *Add; + std::string Name = I > 1 ? "step.add." 
+ std::to_string(I) : "step.add"; + + if (IVTy->isFloatingPointTy()) + Add = Builder.createFPOp(ID.getInductionOpcode(), + { + Prev, + Step, + }, + R.getDebugLoc(), Name, FMFs); + else + Add = Builder.createNaryOp(Instruction::Add, + { + Prev, + Step, + }, + R.getDebugLoc(), Name); + ToSkip.insert(Add); + Ins.first->second.push_back(Add); + InsertPtForPhi = std::next(Add->getIterator()); + } + R.addOperand(getInterleavedValue(InterleavedValues, IV, IC - 1)); + return; + } + + VPRecipeBase *InsertPt = &R; + Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); + for (unsigned I = 1; I != IC; ++I) { + VPRecipeBase *Copy = R.clone(); + Copy->insertAfter(InsertPt); + InsertPt = Copy; + unsigned Idx = 0; + for (VPValue *Res : R.definedValues()) { + auto Ins = InterleavedValues.insert({Res, {}}); + Ins.first->second.push_back(Copy->getVPValue(Idx)); + Idx++; + } + if (isa(&R)) { + Copy->addOperand(R.getVPSingleValue()); + Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I))); + continue; + } + + if (auto *RdxPhi = dyn_cast(&R)) { + if (RdxPhi->isOrdered()) { + Copy->eraseFromParent(); + break; + } + Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I))); + } + + if (I == 1) + PhisToRemap.emplace_back(); + + auto *H = cast(Copy); + PhisToRemap.back().push_back(H); + if (auto *IV = dyn_cast(&R)) { + auto *ALM = cast(IV->getOperand(0)); + auto *ALMCopy = cast(ALM->clone()); + auto *P = cast(ALM->getOperand(0)); + auto *PCopy = cast(P->clone()); + ALMCopy->setOperand(0, PCopy); + + VPBasicBlock *PH = cast( + Plan.getVectorLoopRegion()->getSinglePredecessor()); + PH->appendRecipe(ALMCopy); + PCopy->insertBefore(ALM); + Copy->setOperand(0, ALMCopy); + PCopy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I))); + } + } +} + +static void +interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC, + DenseMap> &InterleavedValues, + VPTypeAnalysis &TypeInfo) { + Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); + if (auto *VPI = dyn_cast(&R)) { + if (VPI->getOpcode() == VPInstruction::BranchOnCount || + VPI->getOpcode() == VPInstruction::BranchOnCond) + return; + } + + if (auto *RepR = dyn_cast(&R)) { + if (isa(RepR->getUnderlyingValue()) && + RepR->getOperand(1)->isDefinedOutsideVectorRegions()) { + R.setOperand( + 0, getInterleavedValue(InterleavedValues, R.getOperand(0), IC - 1)); + return; + } + if (auto *II = dyn_cast(RepR->getUnderlyingValue())) { + if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) { + auto Ins = InterleavedValues.insert({RepR, {}}); + Ins.first->second.push_back(RepR); + return; + } + } + } + + // TODO: Generalize for any uniform recipe. 
+ if (auto *Cast = dyn_cast(&R)) { + if (Cast->getOperand(0)->isLiveIn()) { + auto Ins = InterleavedValues.insert({Cast, {}}); + Ins.first->second.push_back(Cast); + return; + } + } + + if (isa(&R) && + vputils::onlyFirstPartUsed(R.getVPSingleValue())) { + auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}}); + for (unsigned I = 1; I != IC; ++I) { + Ins.first->second.push_back(R.getVPSingleValue()); + } + return; + } + + VPRecipeBase *InsertPt = &R; + for (unsigned I = 1; I != IC; ++I) { + VPRecipeBase *Copy = R.clone(); + Copy->insertAfter(InsertPt); + InsertPt = Copy; + unsigned Idx = 0; + for (VPValue *Res : R.definedValues()) { + auto Ins = InterleavedValues.insert({Res, {}}); + Ins.first->second.push_back(Copy->getVPValue(Idx)); + Idx++; + } + + if (auto *VPI = dyn_cast(&R)) { + if (VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) { + Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I))); + continue; + } + if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) { + Copy->setOperand( + 0, getInterleavedValue(InterleavedValues, R.getOperand(1), I - 1)); + Copy->setOperand( + 1, getInterleavedValue(InterleavedValues, R.getOperand(1), I)); + continue; + } + } + if (auto *Red = dyn_cast(&R)) { + auto *Phi = cast(R.getOperand(0)); + if (Phi->isOrdered()) { + auto Ins = InterleavedValues.insert({Phi, {}}); + if (I == 1) { + Ins.first->second.clear(); + Ins.first->second.push_back(Red); + } + Ins.first->second.push_back(Copy->getVPSingleValue()); + Phi->setOperand(1, Copy->getVPSingleValue()); + } + } + for (unsigned Idx = 0; Idx != Copy->getNumOperands(); ++Idx) + Copy->setOperand(Idx, getInterleavedValue(InterleavedValues, + Copy->getOperand(Idx), I)); + + // Add operand indicating the part to generate code for to recipes still + // requiring it. 
+ if (isa(Copy)) + Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I))); + + if (isa(R)) + Copy->setOperand(0, R.getOperand(0)); + } +} + +void VPlanTransforms::interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx) { + assert(IC > 0); + if (IC == 1) + return; + DenseMap> InterleavedValues; + + SmallPtrSet ToSkip; + + Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); + VPTypeAnalysis TypeInfo(CanIVIntTy, Ctx); + ReversePostOrderTraversal> RPOT( + Plan.getVectorLoopRegion()->getEntry()); + SmallVector> PhisToRemap; + for (VPBlockBase *VPB : RPOT) { + auto *VPR = dyn_cast(VPB); + if (VPR) { + interleaveReplicateRegion(VPR, Plan, IC, InterleavedValues); + continue; + } + + auto *VPBB = cast(VPB); + auto InsertPtForPhi = VPBB->getFirstNonPhi(); + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (ToSkip.contains(&R)) + continue; + auto *SingleDef = dyn_cast(&R); + if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) { + for (unsigned I = 1; I != IC; ++I) { + auto Ins = InterleavedValues.insert({SingleDef, {}}); + Ins.first->second.push_back(SingleDef); + } + continue; + } + + if (auto *H = dyn_cast(&R)) { + interleaveHeaderPHI(R, Plan, IC, InsertPtForPhi, InterleavedValues, + TypeInfo, ToSkip, PhisToRemap); + continue; + } + + interleaveRecipe(R, Plan, IC, InterleavedValues, TypeInfo); + } + } + + for (auto &R : PhisToRemap) { + unsigned I = 1; + for (VPHeaderPHIRecipe *H : R) { + for (unsigned Idx = 0; Idx != H->getNumOperands(); ++Idx) + H->setOperand( + Idx, getInterleavedValue(InterleavedValues, H->getOperand(Idx), I)); + I++; + } + } + + for (VPRecipeBase &H : + Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (!isa(&H)) { + continue; + } + H.setOperand( + 1, getInterleavedValue(InterleavedValues, H.getOperand(1), IC - 1)); + } + + using namespace llvm::VPlanPatternMatch; + for (VPRecipeBase &R : make_early_inc_range(*cast( + Plan.getVectorLoopRegion()->getSingleSuccessor()))) { + VPValue *Op1; + if (match(&R, m_VPInstruction( + m_VPValue(), m_VPValue(Op1)))) { + for (unsigned I = 1; I != IC; ++I) { + R.addOperand(getInterleavedValue(InterleavedValues, Op1, I)); + } + continue; + } + VPValue *Op0; + if (match(&R, m_VPInstruction( + m_VPValue(Op0), m_VPValue()))) { + bool ScalarVFOnly = Plan.hasScalarVFOnly(); + if (!ScalarVFOnly) { + R.setOperand(0, getInterleavedValue(InterleavedValues, Op0, IC - 1)); + continue; + } + } + } + + bool ScalarVFOnly = Plan.hasScalarVFOnly(); + for (const auto &[_, LO] : Plan.getLiveOuts()) { + VPValue *In = nullptr; + VPValue *Op0; + if (ScalarVFOnly && + match(LO->getOperand(0), m_VPInstruction( + m_VPValue(Op0), m_VPValue()))) { + VPInstruction *Extract = + cast(LO->getOperand(0)->getDefiningRecipe()); + unsigned Offset = + cast(Extract->getOperand(1)->getLiveInIRValue()) + ->getZExtValue(); + In = getInterleavedValue(InterleavedValues, Op0, IC - Offset); + LO->setOperand(0, In); + Extract->getDefiningRecipe()->eraseFromParent(); + continue; + } else + In = getInterleavedValue(InterleavedValues, LO->getOperand(0), IC - 1); + + LO->setOperand(0, In); + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 96b8a6639723c..67d22ce46b6d9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -106,6 +106,8 @@ struct VPlanTransforms { /// this transformation. /// \returns true if the transformation succeeds, or false if it doesn't. 
static bool tryAddExplicitVectorLength(VPlan &Plan); + + static void interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 8d945f6f2b8ea..cc4d69a2e7d82 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -184,7 +184,7 @@ class VPValue { /// Returns true if the VPValue is defined outside any vector regions, i.e. it /// is a live-in value. /// TODO: Also handle recipes defined in pre-header blocks. - bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); } + bool isDefinedOutsideVectorRegions() const; // Set \p Val as the underlying Value of this VPValue. void setUnderlyingValue(Value *Val) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll index 22aaa563daa5a..b784c465f878e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -103,10 +103,10 @@ for.end: ; preds = %for.body ; CHECK-LABEL: @ptr_ind_plus2( ; CHECK: %[[V0:.*]] = load <8 x i32> -; CHECK: %[[V1:.*]] = load <8 x i32> ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> -; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: mul nsw <4 x i32> ; CHECK: mul nsw <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index ce1cfda438170..a5e6f891dd572 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -294,9 +294,9 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll index 3217f508f0adc..339e5ced88946 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -14,7 +14,7 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK: %ind.end = trunc i64 %n.vec to i7 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i8() +; CHECK: [[TMP6:%.*]] = call 
@llvm.experimental.stepvector.nxv2i8() ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i7 1, i64 0), poison, zeroinitializer) @@ -74,7 +74,7 @@ define void @induction_i3_zext(ptr %dst) #0 { ; CHECK: %ind.end = trunc i64 %n.vec to i3 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i8() +; CHECK: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i8() ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i3 1, i64 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll index 8b64d7a083662..a52db89594c39 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -40,7 +40,10 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: [[TMP18:%.*]] = add [[DOTSPLAT]], [[TMP17]] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP18]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-NEXT: [[VS2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VS2_1:%.*]] = mul i64 [[VS2]], 2 +; CHECK-NEXT: [[VS2_2:%.*]] = mul i64 [[VS2_1]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[VS2_1]], 1 ; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP20]], i64 0 ; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector [[DOTSPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.experimental.stepvector.nxv2i64() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 61105e51cb946..0b959d99bc9be 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -39,10 +39,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP7]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load , ptr [[TMP10]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load , ptr [[TMP10]], align 4 ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC2]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index 87bc77cb7767f..79c7e4b64c30b 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -1227,12 +1227,12 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] ; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 -; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 ; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4 -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 +; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 +; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] ; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] @@ -1415,12 +1415,12 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] ; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 ; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8 -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8 +; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] ; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 0ecba2f304682..2b48cdd890a5f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -967,9 +967,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; 
FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index b4427864d4730..70a9fc4c5bd16 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -28,14 +28,13 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.experimental.stepvector.nxv4i64() ; IF-EVL-NEXT: [[TMP12:%.*]] = add [[TMP11]], zeroinitializer ; IF-EVL-NEXT: [[TMP13:%.*]] = mul [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) ; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 -; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]] -; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP37]], i64 0 +; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP15]], i64 0 ; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -109,12 +108,12 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0 ; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0 ; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 ; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 -; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 ; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]] ; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index 
c1be67853bf7c..ba94663178bf4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -143,15 +143,17 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[L]], 64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[L]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = mul <16 x i16> , [[TMP2]] + ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> , [[DOTSPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i16> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TMP0]], 16 -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP2]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer + ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index d22ccf6671e84..d6db02b72e9d0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -629,9 +629,6 @@ define void @wide_iv_trunc_reuse(ptr %dst) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll index e64b02f00dfc1..3389941eeb992 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -19,13 +19,13 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] ; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]] ; SSE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 -; SSE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 ; SSE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: 
[[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]] -; SSE-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]] +; SSE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; SSE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; SSE-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 16 ; SSE-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 @@ -61,21 +61,21 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP4]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] -; AVX1-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] -; AVX1-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] -; AVX1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] +; AVX1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 +; AVX1-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 +; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC5]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC5]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC8:%.*]] = load <8 x i32>, ptr [[TMP10]], align 
4 +; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC8]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC8]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; AVX1-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] +; AVX1-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC6]] +; AVX1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC9]] ; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 16 ; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 32 @@ -115,21 +115,21 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP4]] ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] ; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 -; AVX2-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4 -; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4 -; AVX2-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4 ; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] -; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] -; AVX2-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] -; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] +; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4 +; AVX2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC5:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4 +; AVX2-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC8:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4 +; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC8]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC8]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC1]], 
[[STRIDED_VEC]] +; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] +; AVX2-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC6]] +; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC9]] ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 32 ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 64 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index 97e1c1c4362fc..55f95c2baad3c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -101,24 +101,24 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; SSE41-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP2]] ; SSE41-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP3]] ; SSE41-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 -; SSE41-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; SSE41-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2 -; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 ; SSE41-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> ; SSE41-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32> ; SSE41-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]] ; SSE41-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]] ; SSE41-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 ; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 ; SSE41-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> ; SSE41-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> @@ -203,20 +203,20 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias 
nocapture readon ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]] ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]] ; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 -; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 +; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 +; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> ; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> @@ -227,20 +227,20 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] ; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]] ; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 ; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2 -; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 -; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 -; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 ; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = 
shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 +; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 +; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 +; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 +; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 +; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 +; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> ; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index c5c80c3ff6992..d811878642300 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -214,28 +214,19 @@ define void @uniform_store_varying_value(ptr align(4) %addr) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 5 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 6 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], 7 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP0]], 9 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], 10 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 11 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP0]], 13 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 14 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP0]], 15 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[ADDR:%.*]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: 
[[TMP2:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 13 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 14 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 15 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 665ac0e1fa445..39ade9ff95676 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -2754,66 +2754,66 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] -; UNROLL-NO-IC: pred.udiv.if5: +; UNROLL-NO-IC: pred.udiv.if7: ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]] ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE6]] -; UNROLL-NO-IC: pred.udiv.continue6: +; UNROLL-NO-IC: pred.udiv.continue8: ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] -; UNROLL-NO-IC: pred.udiv.if7: +; UNROLL-NO-IC: pred.udiv.if9: ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; UNROLL-NO-IC: pred.udiv.continue8: +; UNROLL-NO-IC: pred.udiv.continue10: ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] -; UNROLL-NO-IC: pred.udiv.if9: +; UNROLL-NO-IC: pred.udiv.if11: ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]] -; UNROLL-NO-IC: pred.udiv.continue10: +; UNROLL-NO-IC: pred.udiv.continue12: ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] -; UNROLL-NO-IC: 
pred.udiv.if11: +; UNROLL-NO-IC: pred.udiv.if13: ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4 ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]] ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE12]] -; UNROLL-NO-IC: pred.udiv.continue12: +; UNROLL-NO-IC: pred.udiv.continue14: ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ] ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] -; UNROLL-NO-IC: pred.udiv.if13: +; UNROLL-NO-IC: pred.udiv.if15: ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]] ; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE14]] -; UNROLL-NO-IC: pred.udiv.continue14: +; UNROLL-NO-IC: pred.udiv.continue16: ; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ] ; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]] -; UNROLL-NO-IC: pred.udiv.if15: +; UNROLL-NO-IC: pred.udiv.if17: ; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6 ; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]] ; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE16]] -; UNROLL-NO-IC: pred.udiv.continue16: +; UNROLL-NO-IC: pred.udiv.continue18: ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ] ; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]] -; UNROLL-NO-IC: pred.udiv.if17: +; UNROLL-NO-IC: pred.udiv.if19: ; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7 ; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]] ; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE18]] -; UNROLL-NO-IC: pred.udiv.continue18: +; UNROLL-NO-IC: pred.udiv.continue20: ; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ] ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> @@ -3064,59 +3064,59 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]] -; UNROLL-NO-IC: pred.udiv.if4: +; UNROLL-NO-IC: pred.udiv.if3: ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1 ; UNROLL-NO-IC-NEXT: br label 
[[PRED_UDIV_CONTINUE5]] -; UNROLL-NO-IC: pred.udiv.continue5: +; UNROLL-NO-IC: pred.udiv.continue4: ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]] -; UNROLL-NO-IC: pred.udiv.if6: +; UNROLL-NO-IC: pred.udiv.if5: ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE7]] -; UNROLL-NO-IC: pred.udiv.continue7: +; UNROLL-NO-IC: pred.udiv.continue6: ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]] -; UNROLL-NO-IC: pred.udiv.if8: +; UNROLL-NO-IC: pred.udiv.if7: ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]] ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE9]] -; UNROLL-NO-IC: pred.udiv.continue9: +; UNROLL-NO-IC: pred.udiv.continue8: ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ] ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]] -; UNROLL-NO-IC: pred.udiv.if10: +; UNROLL-NO-IC: pred.udiv.if9: ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE11]] -; UNROLL-NO-IC: pred.udiv.continue11: +; UNROLL-NO-IC: pred.udiv.continue10: ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ] ; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]] -; UNROLL-NO-IC: pred.udiv.if12: +; UNROLL-NO-IC: pred.udiv.if11: ; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]] ; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE13]] -; UNROLL-NO-IC: pred.udiv.continue13: +; UNROLL-NO-IC: pred.udiv.continue12: ; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ] ; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]] -; UNROLL-NO-IC: pred.udiv.if14: +; UNROLL-NO-IC: pred.udiv.if13: ; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE15]] -; UNROLL-NO-IC: pred.udiv.continue15: +; UNROLL-NO-IC: pred.udiv.continue14: ; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ] ; UNROLL-NO-IC-NEXT: 
[[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]] -; UNROLL-NO-IC: pred.udiv.if16: +; UNROLL-NO-IC: pred.udiv.if15: ; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]] ; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE17]] -; UNROLL-NO-IC: pred.udiv.continue17: +; UNROLL-NO-IC: pred.udiv.continue16: ; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ] ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> @@ -3132,60 +3132,60 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC: pred.store.continue: ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] -; UNROLL-NO-IC: pred.store.if18: +; UNROLL-NO-IC: pred.store.if17: ; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = add i32 [[INDEX]], 1 ; UNROLL-NO-IC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP52]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP3]], ptr [[TMP53]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE19]] -; UNROLL-NO-IC: pred.store.continue19: +; UNROLL-NO-IC: pred.store.continue18: ; UNROLL-NO-IC-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] -; UNROLL-NO-IC: pred.store.if20: +; UNROLL-NO-IC: pred.store.if19: ; UNROLL-NO-IC-NEXT: [[TMP55:%.*]] = add i32 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP55]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP4]], ptr [[TMP56]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE21]] -; UNROLL-NO-IC: pred.store.continue21: +; UNROLL-NO-IC: pred.store.continue20: ; UNROLL-NO-IC-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; UNROLL-NO-IC: pred.store.if22: +; UNROLL-NO-IC: pred.store.if21: ; UNROLL-NO-IC-NEXT: [[TMP58:%.*]] = add i32 [[INDEX]], 3 ; UNROLL-NO-IC-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP58]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP5]], ptr [[TMP59]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE23]] -; UNROLL-NO-IC: pred.store.continue23: +; UNROLL-NO-IC: pred.store.continue22: ; UNROLL-NO-IC-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; UNROLL-NO-IC: pred.store.if24: +; UNROLL-NO-IC: pred.store.if23: ; UNROLL-NO-IC-NEXT: [[TMP61:%.*]] = add i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP61]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP6]], ptr [[TMP62]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE25]] -; UNROLL-NO-IC: pred.store.continue25: +; UNROLL-NO-IC: pred.store.continue24: ; UNROLL-NO-IC-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label 
[[PRED_STORE_CONTINUE27:%.*]] -; UNROLL-NO-IC: pred.store.if26: +; UNROLL-NO-IC: pred.store.if25: ; UNROLL-NO-IC-NEXT: [[TMP64:%.*]] = add i32 [[INDEX]], 5 ; UNROLL-NO-IC-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP64]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP7]], ptr [[TMP65]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE27]] -; UNROLL-NO-IC: pred.store.continue27: +; UNROLL-NO-IC: pred.store.continue26: ; UNROLL-NO-IC-NEXT: [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] -; UNROLL-NO-IC: pred.store.if28: +; UNROLL-NO-IC: pred.store.if27: ; UNROLL-NO-IC-NEXT: [[TMP67:%.*]] = add i32 [[INDEX]], 6 ; UNROLL-NO-IC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP67]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP8]], ptr [[TMP68]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE29]] -; UNROLL-NO-IC: pred.store.continue29: +; UNROLL-NO-IC: pred.store.continue28: ; UNROLL-NO-IC-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]] -; UNROLL-NO-IC: pred.store.if30: +; UNROLL-NO-IC: pred.store.if29: ; UNROLL-NO-IC-NEXT: [[TMP70:%.*]] = add i32 [[INDEX]], 7 ; UNROLL-NO-IC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP70]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP9]], ptr [[TMP71]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE31]] -; UNROLL-NO-IC: pred.store.continue31: +; UNROLL-NO-IC: pred.store.continue30: ; UNROLL-NO-IC-NEXT: [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] ; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index bd658c31768a8..b9328a927b1fd 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -87,15 +87,15 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] +; VEC4_INTERL2-NEXT: [[FPINC_INS:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[FPINC_INS]], [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float 
[[TMP3]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -334,15 +334,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[MUL:%.*]] = fmul reassoc <4 x float> [[DOTSPLATINSERT2]], [[MUL]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -841,29 +841,27 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST]], poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> poison, 
<4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] +; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]] -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT]] +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] @@ -878,7 +876,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT9]] +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC4_INTERL2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VEC4_INTERL2: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index df61798853ef6..790640eb1323d 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1174,14 +1174,14 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x float> [[WIDE_VEC]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x float> [[WIDE_VEC2]], <32 x float> 
poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[WIDE_VEC4:%.*]] = load <32 x float>, ptr [[TMP8]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <32 x float> [[WIDE_VEC4]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <32 x float> [[WIDE_VEC5]], <32 x float> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_PHI]], ; INTERLEAVE-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], @@ -1487,8 +1487,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP8]], i32 1 ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 8 -; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] @@ -5250,30 +5250,30 @@ define i32 @PR32419(i32 %a, i16 %b) { ; UNROLL-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ] ; UNROLL-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i64 1 ; UNROLL-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] -; UNROLL: pred.urem.if3: +; UNROLL: pred.urem.if2: ; UNROLL-NEXT: [[TMP10:%.*]] = add i16 [[TMP1]], -19 ; UNROLL-NEXT: [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]] ; UNROLL-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP11]], i64 1 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE4]] -; UNROLL: pred.urem.continue4: +; UNROLL: pred.urem.continue3: ; UNROLL-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF3]] ] ; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0 ; UNROLL-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] -; UNROLL: pred.urem.if5: +; UNROLL: pred.urem.if4: ; UNROLL-NEXT: [[TMP15:%.*]] = add i16 [[TMP1]], -18 ; UNROLL-NEXT: [[TMP16:%.*]] = urem i16 [[B]], [[TMP15]] ; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <2 x i16> poison, i16 [[TMP16]], i64 0 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE6]] -; UNROLL: pred.urem.continue6: +; UNROLL: pred.urem.continue5: ; UNROLL-NEXT: [[TMP18:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP17]], [[PRED_UREM_IF5]] ] ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1 ; UNROLL-NEXT: br i1 [[TMP19]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] -; UNROLL: pred.urem.if7: +; UNROLL: pred.urem.if6: ; UNROLL-NEXT: [[TMP20:%.*]] = add i16 [[TMP1]], -17 ; UNROLL-NEXT: [[TMP21:%.*]] = urem i16 [[B]], [[TMP20]] ; UNROLL-NEXT: [[TMP22:%.*]] = insertelement 
<2 x i16> [[TMP18]], i16 [[TMP21]], i64 1 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE8]] -; UNROLL: pred.urem.continue8: +; UNROLL: pred.urem.continue7: ; UNROLL-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP18]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP22]], [[PRED_UREM_IF7]] ] ; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[TMP13]], <2 x i16> zeroinitializer ; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP23]], <2 x i16> zeroinitializer @@ -5330,30 +5330,30 @@ define i32 @PR32419(i32 %a, i16 %b) { ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UREM_IF]] ] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] -; UNROLL-NO-IC: pred.urem.if3: +; UNROLL-NO-IC: pred.urem.if2: ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i16 [[TMP1]], 1 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = urem i16 [[B]], [[TMP12]] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE4]] -; UNROLL-NO-IC: pred.urem.continue4: +; UNROLL-NO-IC: pred.urem.continue3: ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP10]], [[PRED_UREM_CONTINUE]] ], [ [[TMP14]], [[PRED_UREM_IF3]] ] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] -; UNROLL-NO-IC: pred.urem.if5: +; UNROLL-NO-IC: pred.urem.if4: ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = add i16 [[TMP1]], 2 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = urem i16 [[B]], [[TMP17]] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE6]] -; UNROLL-NO-IC: pred.urem.continue6: +; UNROLL-NO-IC: pred.urem.continue5: ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP19]], [[PRED_UREM_IF5]] ] ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] -; UNROLL-NO-IC: pred.urem.if7: +; UNROLL-NO-IC: pred.urem.if6: ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = add i16 [[TMP1]], 3 ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = urem i16 [[B]], [[TMP22]] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP23]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE8]] -; UNROLL-NO-IC: pred.urem.continue8: +; UNROLL-NO-IC: pred.urem.continue7: ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = phi <2 x i16> [ [[TMP20]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP24]], [[PRED_UREM_IF7]] ] ; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP15]] ; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP25]] @@ -5418,66 +5418,66 @@ define i32 @PR32419(i32 %a, i16 %b) { ; INTERLEAVE-NEXT: [[TMP8:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] -; INTERLEAVE: pred.urem.if3: +; INTERLEAVE: pred.urem.if2: ; INTERLEAVE-NEXT: [[TMP10:%.*]] = add i16 [[TMP1]], -19 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = 
insertelement <4 x i16> [[TMP8]], i16 [[TMP11]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE4]] -; INTERLEAVE: pred.urem.continue4: +; INTERLEAVE: pred.urem.continue3: ; INTERLEAVE-NEXT: [[TMP13:%.*]] = phi <4 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF3]] ] ; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 ; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] -; INTERLEAVE: pred.urem.if5: +; INTERLEAVE: pred.urem.if4: ; INTERLEAVE-NEXT: [[TMP15:%.*]] = add i16 [[TMP1]], -18 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = urem i16 [[B]], [[TMP15]] ; INTERLEAVE-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP16]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE6]] -; INTERLEAVE: pred.urem.continue6: +; INTERLEAVE: pred.urem.continue5: ; INTERLEAVE-NEXT: [[TMP18:%.*]] = phi <4 x i16> [ [[TMP13]], [[PRED_UREM_CONTINUE4]] ], [ [[TMP17]], [[PRED_UREM_IF5]] ] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 ; INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]] -; INTERLEAVE: pred.urem.if7: +; INTERLEAVE: pred.urem.if6: ; INTERLEAVE-NEXT: [[TMP20:%.*]] = add i16 [[TMP1]], -17 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = urem i16 [[B]], [[TMP20]] ; INTERLEAVE-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP21]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE8]] -; INTERLEAVE: pred.urem.continue8: +; INTERLEAVE: pred.urem.continue7: ; INTERLEAVE-NEXT: [[TMP23:%.*]] = phi <4 x i16> [ [[TMP18]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP22]], [[PRED_UREM_IF7]] ] ; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 ; INTERLEAVE-NEXT: br i1 [[TMP24]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]] -; INTERLEAVE: pred.urem.if9: +; INTERLEAVE: pred.urem.if8: ; INTERLEAVE-NEXT: [[TMP25:%.*]] = add i16 [[TMP1]], -16 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = urem i16 [[B]], [[TMP25]] ; INTERLEAVE-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[TMP26]], i64 0 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE10]] -; INTERLEAVE: pred.urem.continue10: +; INTERLEAVE: pred.urem.continue9: ; INTERLEAVE-NEXT: [[TMP28:%.*]] = phi <4 x i16> [ poison, [[PRED_UREM_CONTINUE8]] ], [ [[TMP27]], [[PRED_UREM_IF9]] ] ; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 ; INTERLEAVE-NEXT: br i1 [[TMP29]], label [[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]] -; INTERLEAVE: pred.urem.if11: +; INTERLEAVE: pred.urem.if10: ; INTERLEAVE-NEXT: [[TMP30:%.*]] = add i16 [[TMP1]], -15 ; INTERLEAVE-NEXT: [[TMP31:%.*]] = urem i16 [[B]], [[TMP30]] ; INTERLEAVE-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP31]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE12]] -; INTERLEAVE: pred.urem.continue12: +; INTERLEAVE: pred.urem.continue11: ; INTERLEAVE-NEXT: [[TMP33:%.*]] = phi <4 x i16> [ [[TMP28]], [[PRED_UREM_CONTINUE10]] ], [ [[TMP32]], [[PRED_UREM_IF11]] ] ; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 ; INTERLEAVE-NEXT: br i1 [[TMP34]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]] -; INTERLEAVE: pred.urem.if13: +; INTERLEAVE: pred.urem.if12: ; INTERLEAVE-NEXT: [[TMP35:%.*]] = add i16 [[TMP1]], -14 ; INTERLEAVE-NEXT: [[TMP36:%.*]] = urem i16 [[B]], [[TMP35]] ; INTERLEAVE-NEXT: [[TMP37:%.*]] = insertelement <4 x i16> [[TMP33]], i16 [[TMP36]], i64 2 ; INTERLEAVE-NEXT: br label 
[[PRED_UREM_CONTINUE14]] -; INTERLEAVE: pred.urem.continue14: +; INTERLEAVE: pred.urem.continue13: ; INTERLEAVE-NEXT: [[TMP38:%.*]] = phi <4 x i16> [ [[TMP33]], [[PRED_UREM_CONTINUE12]] ], [ [[TMP37]], [[PRED_UREM_IF13]] ] ; INTERLEAVE-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 ; INTERLEAVE-NEXT: br i1 [[TMP39]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16]] -; INTERLEAVE: pred.urem.if15: +; INTERLEAVE: pred.urem.if14: ; INTERLEAVE-NEXT: [[TMP40:%.*]] = add i16 [[TMP1]], -13 ; INTERLEAVE-NEXT: [[TMP41:%.*]] = urem i16 [[B]], [[TMP40]] ; INTERLEAVE-NEXT: [[TMP42:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP41]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE16]] -; INTERLEAVE: pred.urem.continue16: +; INTERLEAVE: pred.urem.continue15: ; INTERLEAVE-NEXT: [[TMP43:%.*]] = phi <4 x i16> [ [[TMP38]], [[PRED_UREM_CONTINUE14]] ], [ [[TMP42]], [[PRED_UREM_IF15]] ] ; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP23]], <4 x i16> zeroinitializer ; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP43]], <4 x i16> zeroinitializer @@ -6379,12 +6379,12 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] +; UNROLL-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 +; UNROLL-NEXT: [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLATINSERT2]], +; UNROLL-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[TMP15:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], -; UNROLL-NEXT: [[TMP16:%.*]] = shl i32 [[STEP]], 1 -; UNROLL-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -6457,13 +6457,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[DOTSPLAT3:%.*]] = mul <2 x i32> , [[TMP18]] ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = mul <2 x i32> , [[DOTSPLAT]] ; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP17]] -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul i32 [[STEP]], 2 -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i64 0 -; 
UNROLL-NO-IC-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -6537,12 +6537,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8
; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
+; INTERLEAVE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
+; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLATINSERT2]],
+; INTERLEAVE-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer
+
; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[DOTSPLAT]],
-; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl i32 [[STEP]], 2
-; INTERLEAVE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP16]], i64 0
-; INTERLEAVE-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bcc8632da4c64..89c5c0226212b 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -216,7 +216,6 @@ define void @first_order_recurrence_using_induction(i32 %n, ptr %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[INDUCTION:%.*]] = add i32 [[TMP3]], 0
; CHECK-NEXT: [[INDUCTION1]] = add i32 [[TMP3]], 1
-; CHECK-NEXT: store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: store i32 [[INDUCTION]], ptr [[DST]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec
@@ -290,7 +289,6 @@ define void @scalarize_ptrtoint(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 10
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: store ptr [[TMP10]], ptr %dst, align 8
; CHECK-NEXT: store ptr [[TMP11]], ptr %dst, align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index f05ec30619c5d..3b1f1fa7d550b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -92,28 +92,28 @@ define void @pr45679(ptr %A) optsize {
; VF2UF2: pred.store.continue:
; VF2UF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; VF2UF2-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
-; VF2UF2:
pred.store.if2: +; VF2UF2: pred.store.if1: ; VF2UF2-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1 ; VF2UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP6]] ; VF2UF2-NEXT: store i32 13, ptr [[TMP7]], align 1 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE3]] -; VF2UF2: pred.store.continue3: +; VF2UF2: pred.store.continue2: ; VF2UF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; VF2UF2-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; VF2UF2: pred.store.if4: +; VF2UF2: pred.store.if3: ; VF2UF2-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 2 ; VF2UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]] ; VF2UF2-NEXT: store i32 13, ptr [[TMP10]], align 1 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE5]] -; VF2UF2: pred.store.continue5: +; VF2UF2: pred.store.continue4: ; VF2UF2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.if6: +; VF2UF2: pred.store.if5: ; VF2UF2-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 3 ; VF2UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP12]] ; VF2UF2-NEXT: store i32 13, ptr [[TMP13]], align 1 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.continue7: +; VF2UF2: pred.store.continue6: ; VF2UF2-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], ; VF2UF2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 @@ -298,33 +298,33 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF2UF2-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ] ; VF2UF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] -; VF2UF2: pred.store.if2: +; VF2UF2: pred.store.if1: ; VF2UF2-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 ; VF2UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] ; VF2UF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2UF2-NEXT: store i64 [[TMP10]], ptr [[B]], align 8 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE3]] -; VF2UF2: pred.store.continue3: +; VF2UF2: pred.store.continue2: ; VF2UF2-NEXT: [[TMP11:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP10]], [[PRED_STORE_IF2]] ] ; VF2UF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; VF2UF2-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; VF2UF2: pred.store.if4: +; VF2UF2: pred.store.if3: ; VF2UF2-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 2 ; VF2UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] ; VF2UF2-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF2UF2-NEXT: store i64 [[TMP15]], ptr [[B]], align 8 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE5]] -; VF2UF2: pred.store.continue5: +; VF2UF2: pred.store.continue4: ; VF2UF2-NEXT: [[TMP16:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE3]] ], [ [[TMP15]], [[PRED_STORE_IF4]] ] ; VF2UF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.if6: +; VF2UF2: pred.store.if5: ; VF2UF2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 3 ; VF2UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP18]] ; VF2UF2-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP19]], 
align 8 ; VF2UF2-NEXT: store i64 [[TMP20]], ptr [[B]], align 8 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.continue7: +; VF2UF2: pred.store.continue6: ; VF2UF2-NEXT: [[TMP21:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE5]] ], [ [[TMP20]], [[PRED_STORE_IF6]] ] ; VF2UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index 306ec125dc202..657860a0440d2 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -97,153 +97,153 @@ define i32 @predicated(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP12]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: +; CHECK: pred.load.continue5: ; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 ; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP18]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP19]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 ; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: +; CHECK: pred.load.if8: ; CHECK-NEXT: [[TMP22:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: +; CHECK: pred.load.continue9: ; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP25]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: +; CHECK: pred.load.if10: ; CHECK-NEXT: [[TMP28:%.*]] = or disjoint i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x 
i32> poison, i32 [[TMP30]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: +; CHECK: pred.load.continue11: ; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP31]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 ; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: +; CHECK: pred.load.if12: ; CHECK-NEXT: [[TMP34:%.*]] = or disjoint i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP36]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: +; CHECK: pred.load.continue13: ; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP37]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 ; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] -; CHECK: pred.load.if17: +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP40:%.*]] = or disjoint i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4 ; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP42]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] -; CHECK: pred.load.continue18: +; CHECK: pred.load.continue15: ; CHECK-NEXT: [[TMP44:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP43]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 ; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] -; CHECK: pred.load.if19: +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP46:%.*]] = or disjoint i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP46]] ; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 ; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP48]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] -; CHECK: pred.load.continue20: +; CHECK: pred.load.continue17: ; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP44]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP49]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 ; CHECK-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] -; CHECK: pred.load.if21: +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP52:%.*]] = or disjoint i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP52]] ; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP53]], align 4 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> poison, i32 [[TMP54]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] -; CHECK: pred.load.continue22: +; CHECK: pred.load.continue19: ; CHECK-NEXT: [[TMP56:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP55]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 ; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] -; CHECK: pred.load.if23: +; CHECK: pred.load.if20: ; CHECK-NEXT: [[TMP58:%.*]] = or disjoint i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr 
[[A]], i64 [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP60]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] -; CHECK: pred.load.continue24: +; CHECK: pred.load.continue21: ; CHECK-NEXT: [[TMP62:%.*]] = phi <4 x i32> [ [[TMP56]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP61]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 ; CHECK-NEXT: br i1 [[TMP63]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] -; CHECK: pred.load.if25: +; CHECK: pred.load.if22: ; CHECK-NEXT: [[TMP64:%.*]] = or disjoint i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP64]] ; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> [[TMP62]], i32 [[TMP66]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] -; CHECK: pred.load.continue26: +; CHECK: pred.load.continue23: ; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ [[TMP62]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP67]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 ; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] -; CHECK: pred.load.if27: +; CHECK: pred.load.if24: ; CHECK-NEXT: [[TMP70:%.*]] = or disjoint i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP70]] ; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP72]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] -; CHECK: pred.load.continue28: +; CHECK: pred.load.continue25: ; CHECK-NEXT: [[TMP74:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP73]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 ; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] -; CHECK: pred.load.if29: +; CHECK: pred.load.if26: ; CHECK-NEXT: [[TMP76:%.*]] = or disjoint i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP76]] ; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> poison, i32 [[TMP78]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: +; CHECK: pred.load.continue27: ; CHECK-NEXT: [[TMP80:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP79]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP81:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 ; CHECK-NEXT: br i1 [[TMP81]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: +; CHECK: pred.load.if28: ; CHECK-NEXT: [[TMP82:%.*]] = or disjoint i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP82]] ; CHECK-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP83]], align 4 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP84]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: +; CHECK: pred.load.continue29: ; CHECK-NEXT: [[TMP86:%.*]] = phi <4 x i32> [ [[TMP80]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP85]], [[PRED_LOAD_IF31]] ] ; CHECK-NEXT: [[TMP87:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 ; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: +; 
CHECK: pred.load.if30: ; CHECK-NEXT: [[TMP88:%.*]] = or disjoint i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP88]] ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 ; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP90]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: +; CHECK: pred.load.continue31: ; CHECK-NEXT: [[TMP92:%.*]] = phi <4 x i32> [ [[TMP86]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP91]], [[PRED_LOAD_IF33]] ] ; CHECK-NEXT: [[TMP93:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 ; CHECK-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.if35: +; CHECK: pred.load.if32: ; CHECK-NEXT: [[TMP94:%.*]] = or disjoint i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP94]] ; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP96]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: +; CHECK: pred.load.continue33: ; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP97]], [[PRED_LOAD_IF35]] ] ; CHECK-NEXT: [[TMP99:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP26]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP100:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP99]]) @@ -339,153 +339,153 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i64 1 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP18:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP20]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP16]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 2 ; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: +; CHECK: pred.load.if8: ; CHECK-NEXT: [[TMP24:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP26]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: +; CHECK: pred.load.continue9: ; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP27]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP8]], i64 3 ; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: +; CHECK: pred.load.if10: ; CHECK-NEXT: [[TMP30:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], 
align 4 ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP32]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: +; CHECK: pred.load.continue11: ; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP33]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0 ; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: +; CHECK: pred.load.if12: ; CHECK-NEXT: [[TMP36:%.*]] = or disjoint i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP36]] ; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> poison, i32 [[TMP38]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: +; CHECK: pred.load.continue13: ; CHECK-NEXT: [[TMP40:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP39]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1 ; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] -; CHECK: pred.load.if17: +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP42:%.*]] = or disjoint i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] ; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP44]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] -; CHECK: pred.load.continue18: +; CHECK: pred.load.continue15: ; CHECK-NEXT: [[TMP46:%.*]] = phi <4 x i32> [ [[TMP40]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP45]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2 ; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] -; CHECK: pred.load.if19: +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP48:%.*]] = or disjoint i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4 ; CHECK-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP50]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] -; CHECK: pred.load.continue20: +; CHECK: pred.load.continue17: ; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP46]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP51]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3 ; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] -; CHECK: pred.load.if21: +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP54:%.*]] = or disjoint i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP54]] ; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 ; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP56]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] -; CHECK: pred.load.continue22: +; CHECK: pred.load.continue19: ; CHECK-NEXT: [[TMP58:%.*]] = phi <4 x i32> [ [[TMP52]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP57]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0 ; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] -; CHECK: pred.load.if23: +; CHECK: pred.load.if20: ; CHECK-NEXT: [[TMP60:%.*]] = or disjoint i64 [[INDEX]], 
8 ; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP60]] ; CHECK-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP61]], align 4 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> poison, i32 [[TMP62]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] -; CHECK: pred.load.continue24: +; CHECK: pred.load.continue21: ; CHECK-NEXT: [[TMP64:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE22]] ], [ [[TMP63]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1 ; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] -; CHECK: pred.load.if25: +; CHECK: pred.load.if22: ; CHECK-NEXT: [[TMP66:%.*]] = or disjoint i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP66]] ; CHECK-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4 ; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP68]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] -; CHECK: pred.load.continue26: +; CHECK: pred.load.continue23: ; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ [[TMP64]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP69]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2 ; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] -; CHECK: pred.load.if27: +; CHECK: pred.load.if24: ; CHECK-NEXT: [[TMP72:%.*]] = or disjoint i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP72]] ; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP74]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] -; CHECK: pred.load.continue28: +; CHECK: pred.load.continue25: ; CHECK-NEXT: [[TMP76:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP75]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3 ; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] -; CHECK: pred.load.if29: +; CHECK: pred.load.if26: ; CHECK-NEXT: [[TMP78:%.*]] = or disjoint i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP78]] ; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4 ; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP80]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: +; CHECK: pred.load.continue27: ; CHECK-NEXT: [[TMP82:%.*]] = phi <4 x i32> [ [[TMP76]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP81]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i1> [[TMP11]], i64 0 ; CHECK-NEXT: br i1 [[TMP83]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: +; CHECK: pred.load.if28: ; CHECK-NEXT: [[TMP84:%.*]] = or disjoint i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP84]] ; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: +; CHECK: pred.load.continue29: ; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP87]], [[PRED_LOAD_IF31]] ] ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP11]], i64 1 ; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF33:%.*]], 
label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: +; CHECK: pred.load.if30: ; CHECK-NEXT: [[TMP90:%.*]] = or disjoint i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP90]] ; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4 ; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP92]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: +; CHECK: pred.load.continue31: ; CHECK-NEXT: [[TMP94:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP93]], [[PRED_LOAD_IF33]] ] ; CHECK-NEXT: [[TMP95:%.*]] = extractelement <4 x i1> [[TMP11]], i64 2 ; CHECK-NEXT: br i1 [[TMP95]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK: pred.load.if35: +; CHECK: pred.load.if32: ; CHECK-NEXT: [[TMP96:%.*]] = or disjoint i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP96]] ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP97]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP98]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: +; CHECK: pred.load.continue33: ; CHECK-NEXT: [[TMP100:%.*]] = phi <4 x i32> [ [[TMP94]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP99]], [[PRED_LOAD_IF35]] ] ; CHECK-NEXT: [[TMP101:%.*]] = extractelement <4 x i1> [[TMP11]], i64 3 ; CHECK-NEXT: br i1 [[TMP101]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.if37: +; CHECK: pred.load.if34: ; CHECK-NEXT: [[TMP102:%.*]] = or disjoint i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP102]] ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP103]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP104]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.continue38: +; CHECK: pred.load.continue35: ; CHECK-NEXT: [[TMP106:%.*]] = phi <4 x i32> [ [[TMP100]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP105]], [[PRED_LOAD_IF37]] ] ; CHECK-NEXT: [[TMP107:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP34]], <4 x i32> ; CHECK-NEXT: [[TMP108:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP107]]) diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll index 1b9f15a419ea3..33e776ea8e59f 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll @@ -21,9 +21,9 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -103,8 +103,8 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: 
[[TMP5:%.*]] = shl i64 [[TMP4]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv1i64()
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv1i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index 2bb3c898c7cda..f59c6050bbb31 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -55,13 +55,20 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; CHECK-NEXT: vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: EMIT vp<[[PADD:%.+]]> = ptradd ir<%A>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[VPTR:%.]]> = vector-pointer vp<[[PADD]]>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR]]>
+; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<1>
+; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT: EMIT vp<[[PADD2:%.+]]> = ptradd ir<%A>, vp<[[STEPS2]]>
+; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer vp<[[PADD]]>
-; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add>
+; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-cond ir
; CHECK-NEXT: No successors