diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c4582df89213d..781ef29cb5a41 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2416,15 +2416,16 @@ class BoUpSLP { } /// Go through the instructions in VL and append their operands. - void appendOperandsOfVL(ArrayRef VL, Instruction *VL0) { + void appendOperandsOfVL(ArrayRef VL, const InstructionsState &S) { assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. constexpr unsigned IntrinsicNumOperands = 2; - unsigned NumOperands = VL0->getNumOperands(); - ArgSize = isa(VL0) ? IntrinsicNumOperands : NumOperands; + unsigned NumOperands = S.getMainOp()->getNumOperands(); + ArgSize = isa(S.getMainOp()) ? IntrinsicNumOperands + : NumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { @@ -2444,8 +2445,8 @@ class BoUpSLP { // tell the inverse operations by checking commutativity. if (isa(VL[Lane])) { OpsVec[OpIdx][Lane] = { - PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, - false}; + PoisonValue::get(S.getMainOp()->getOperand(OpIdx)->getType()), + true, false}; continue; } bool IsInverseOperation = !isCommutative(cast(VL[Lane])); @@ -2557,11 +2558,12 @@ class BoUpSLP { public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, Instruction *VL0, const BoUpSLP &R) + VLOperands(ArrayRef RootVL, const InstructionsState &S, + const BoUpSLP &R) : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), - L(R.LI->getLoopFor((VL0->getParent()))) { + L(R.LI->getLoopFor(S.getMainOp()->getParent())) { // Append all the operands of RootVL. - appendOperandsOfVL(RootVL, VL0); + appendOperandsOfVL(RootVL, S); } /// \Returns a value vector with the operands across all lanes for the @@ -3034,7 +3036,7 @@ class BoUpSLP { /// non-identity permutation that allows to reuse extract instructions. /// \param ResizeAllowed indicates whether it is allowed to handle subvector /// extract order. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef VL, SmallVectorImpl &CurrentOrder, bool ResizeAllowed = false) const; @@ -3261,7 +3263,7 @@ class BoUpSLP { }; /// Checks if the current node is a gather node. - bool isGather() const {return State == NeedToGather; } + bool isGather() const { return State == NeedToGather; } /// A vector of scalars. ValueList Scalars; @@ -3325,9 +3327,9 @@ class BoUpSLP { /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector Operands; - /// The main/alternate instruction. - Instruction *MainOp = nullptr; - Instruction *AltOp = nullptr; + /// MainOp and AltOp are recorded inside. S should be obtained from + /// newTreeEntry. + InstructionsState S = InstructionsState::invalid(); /// Interleaving factor for interleaved loads Vectorize nodes. unsigned InterleaveFactor = 0; @@ -3351,10 +3353,10 @@ class BoUpSLP { /// Set this bundle's operand from Scalars. void setOperand(const BoUpSLP &R, bool RequireReorder = false) { - VLOperands Ops(Scalars, MainOp, R); + VLOperands Ops(Scalars, S, R); if (RequireReorder) Ops.reorder(); - for (unsigned I : seq(MainOp->getNumOperands())) + for (unsigned I : seq(S.getMainOp()->getNumOperands())) setOperand(I, Ops.getVL(I)); } @@ -3387,13 +3389,9 @@ class BoUpSLP { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return MainOp != AltOp; } + bool isAltShuffle() const { return S.isAltShuffle(); } - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return (getOpcode() == CheckedOpcode || - getAltOpcode() == CheckedOpcode); - } + bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is @@ -3402,31 +3400,24 @@ class BoUpSLP { auto *I = dyn_cast(Op); if (I && isOpcodeOrAlt(I)) return Op; - return MainOp; + return S.getMainOp(); } void setOperations(const InstructionsState &S) { assert(S && "InstructionsState is invalid."); - MainOp = S.getMainOp(); - AltOp = S.getAltOp(); + this->S = S; } - Instruction *getMainOp() const { - return MainOp; - } + Instruction *getMainOp() const { return S.getMainOp(); } - Instruction *getAltOp() const { - return AltOp; - } + Instruction *getAltOp() const { return S.getAltOp(); } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { - return MainOp ? MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return S.getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return S.getAltOpcode(); } + + bool hasState() const { return S.valid(); } /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. @@ -3522,16 +3513,13 @@ class BoUpSLP { dbgs() << "CombinedVectorize\n"; break; } - dbgs() << "MainOp: "; - if (MainOp) - dbgs() << *MainOp << "\n"; - else - dbgs() << "NULL\n"; - dbgs() << "AltOp: "; - if (AltOp) - dbgs() << *AltOp << "\n"; - else - dbgs() << "NULL\n"; + if (S) { + dbgs() << "MainOp: " << *S.getMainOp() << "\n"; + dbgs() << "AltOp: " << *S.getAltOp() << "\n"; + } else { + dbgs() << "MainOp: NULL\n"; + dbgs() << "AltOp: NULL\n"; + } dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue << "\n"; @@ -3706,9 +3694,13 @@ class BoUpSLP { } #endif - TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } + TreeEntry *getTreeEntry(Value *V) { + assert(V && "V cannot be nullptr."); + return ScalarToTreeEntry.lookup(V); + } const TreeEntry *getTreeEntry(Value *V) const { + assert(V && "V cannot be nullptr."); return ScalarToTreeEntry.lookup(V); } @@ -5559,7 +5551,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Try build correct order for extractelement instructions. SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), TE.ReuseShuffleIndices.end()); - if (TE.getOpcode() == Instruction::ExtractElement && + if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && all_of(TE.Scalars, [Sz](Value *V) { if (isa(V)) return true; @@ -5721,10 +5713,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. return std::move(Phis); } - if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) { + if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && + allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. - if ((TE.getOpcode() == Instruction::ExtractElement || + if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || (all_of(TE.Scalars, IsaPred) && any_of(TE.Scalars, IsaPred))) && all_of(TE.Scalars, [](Value *V) { @@ -5734,8 +5727,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, - /*ResizeAllowed=*/true); + bool Reuse = + canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); } @@ -5784,7 +5777,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return Order; // Check if can include the order of vectorized loads. For masked gathers do // extra analysis later, so include such nodes into a special list. - if (TE.isGather() && TE.getOpcode() == Instruction::Load) { + if (TE.hasState() && TE.getOpcode() == Instruction::Load) { SmallVector PointerOps; OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), @@ -5899,7 +5892,7 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->isAltShuffle()) { + if (TE->hasState() && TE->isAltShuffle()) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -5978,7 +5971,7 @@ void BoUpSLP::reorderTopToBottom() { if (It != GathersToOrders.end()) return It->second; } - if (OpTE->isAltShuffle()) { + if (OpTE->hasState() && OpTE->isAltShuffle()) { auto It = AltShufflesToOrders.find(OpTE); if (It != AltShufflesToOrders.end()) return It->second; @@ -7579,7 +7572,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(VL, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. if (!has_single_bit(VL.size())) return TreeEntry::NeedToGather; @@ -8592,7 +8585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); ValueList Left, Right; - VLOperands Ops(VL, VL0, *this); + VLOperands Ops(VL, S, *this); if (cast(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. @@ -8860,7 +8853,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef VL, SmallVectorImpl &CurrentOrder, bool ResizeAllowed) const { const auto *It = find_if(VL, IsaPred); @@ -9514,7 +9507,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { // Do not reorder nodes if it small (just 2 elements), all-constant or all // instructions have same opcode already. - if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) || + if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) || all_of(TE.Scalars, isConstant)) return; @@ -9733,7 +9726,7 @@ void BoUpSLP::transformNodes() { // Do not try partial vectorization for small nodes (<= 2), nodes with the // same opcode and same parent block or all constants. if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || - !(!E.getOpcode() || E.getOpcode() == Instruction::Load || + !(!E.hasState() || E.getOpcode() == Instruction::Load || E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; @@ -9876,6 +9869,8 @@ void BoUpSLP::transformNodes() { E.ReorderIndices.clear(); } } + if (!E.hasState()) + continue; switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -9995,7 +9990,7 @@ void BoUpSLP::transformNodes() { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -10011,13 +10006,13 @@ void BoUpSLP::transformNodes() { for (std::unique_ptr &TE : VectorizableTree) { TreeEntry &E = *TE; if (E.isGather() && - (E.getOpcode() == Instruction::Load || - (!E.getOpcode() && any_of(E.Scalars, - [&](Value *V) { - return isa(V) && - !isVectorized(V) && - !isDeleted(cast(V)); - }))) && + ((E.hasState() && E.getOpcode() == Instruction::Load) || + (!E.hasState() && any_of(E.Scalars, + [&](Value *V) { + return isa(V) && + !isVectorized(V) && + !isDeleted(cast(V)); + }))) && !isSplat(E.Scalars)) { for (Value *V : E.Scalars) { auto *LI = dyn_cast(V); @@ -10611,7 +10606,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool PrevNodeFound = any_of( ArrayRef(R.VectorizableTree).take_front(E->Idx), [&](const std::unique_ptr &TE) { - return ((!TE->isAltShuffle() && + return ((TE->hasState() && !TE->isAltShuffle() && TE->getOpcode() == Instruction::ExtractElement) || TE->isGather()) && all_of(enumerate(TE->Scalars), [&](auto &&Data) { @@ -11725,7 +11720,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, for (const std::unique_ptr &TE : VectorizableTree) { if (TE.get() == E) break; - if (TE->isAltShuffle() && + if (TE->hasState() && TE->isAltShuffle() && ((TE->getOpcode() == E->getOpcode() && TE->getAltOpcode() == E->getAltOpcode()) || (TE->getOpcode() == E->getAltOpcode() && @@ -11887,10 +11882,12 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - ((TE->getOpcode() == Instruction::ExtractElement || + (((TE->hasState() && + TE->getOpcode() == Instruction::ExtractElement) || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask, AC)) || - (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || + ((TE->hasState() && TE->getOpcode() == Instruction::Load) && + (!TE->hasState() || !TE->isAltShuffle())) || any_of(TE->Scalars, IsaPred)); }; @@ -12019,9 +12016,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { !VectorizableTree.empty() && all_of(VectorizableTree, [&](const std::unique_ptr &TE) { return (TE->isGather() && - TE->getOpcode() != Instruction::ExtractElement && + (!TE->hasState() || + TE->getOpcode() != Instruction::ExtractElement) && count_if(TE->Scalars, IsaPred) <= Limit) || - TE->getOpcode() == Instruction::PHI; + (TE->hasState() && TE->getOpcode() == Instruction::PHI); })) return true; @@ -12055,6 +12053,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return false; if (VectorizableTree.back()->isGather() && + VectorizableTree.back()->hasState() && VectorizableTree.back()->isAltShuffle() && VectorizableTree.back()->getVectorFactor() > 2 && allSameBlock(VectorizableTree.back()->Scalars) && @@ -12079,7 +12078,7 @@ bool BoUpSLP::isTreeNotExtendable() const { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -12091,7 +12090,7 @@ bool BoUpSLP::isTreeNotExtendable() const { TreeEntry &E = *VectorizableTree[Idx]; if (!E.isGather()) continue; - if (E.getOpcode() && E.getOpcode() != Instruction::Load) + if (E.hasState() && E.getOpcode() != Instruction::Load) return false; if (isSplat(E.Scalars) || allConstant(E.Scalars)) continue; @@ -12401,7 +12400,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather()) { + if (TE.isGather() && TE.hasState()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && E->isSame(TE.Scalars)) { @@ -14846,14 +14845,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } // Gather extracts after we check for full matched gathers only. - if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - ((E->getOpcode() == Instruction::Load || + if (!ExtractShuffles.empty() || !E->hasState() || + E->getOpcode() != Instruction::Load || + (((E->hasState() && E->getOpcode() == Instruction::Load) || any_of(E->Scalars, IsaPred)) && any_of(E->Scalars, [this](Value *V) { return isa(V) && getTreeEntry(V); })) || - E->isAltShuffle() || + (E->hasState() && E->isAltShuffle()) || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { @@ -15233,7 +15233,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); if (E->isGather()) { // Set insert point for non-reduction initial nodes. - if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) + if (E->hasState() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); E->VectorizedValue = Vec; @@ -18122,10 +18122,9 @@ void BoUpSLP::computeMinimumValueSizes() { return; SmallVector ToDemote; - auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, - bool IsProfitableToDemoteRoot, unsigned Opcode, - unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) -> unsigned { + auto ComputeMaxBitWidth = + [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, + unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { ToDemote.clear(); // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. @@ -18166,11 +18165,14 @@ void BoUpSLP::computeMinimumValueSizes() { return MaxBitWidth; } + if (!E.hasState()) + return 0u; + unsigned VF = E.getVectorFactor(); Type *ScalarTy = E.Scalars.front()->getType(); unsigned ScalarTyNumElements = getNumElements(ScalarTy); auto *TreeRootIT = dyn_cast(ScalarTy->getScalarType()); - if (!TreeRootIT || !Opcode) + if (!TreeRootIT) return 0u; if (any_of(E.Scalars, @@ -18242,6 +18244,7 @@ void BoUpSLP::computeMinimumValueSizes() { IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; + unsigned Opcode = E.getOpcode(); bool IsProfitableToDemote = Opcode == Instruction::Trunc || Opcode == Instruction::SExt || Opcode == Instruction::ZExt || NumParts > 1; @@ -18322,15 +18325,14 @@ void BoUpSLP::computeMinimumValueSizes() { while (NodeIdx < VectorizableTree.size()) { ArrayRef TreeRoot = VectorizableTree[NodeIdx]->Scalars; unsigned Limit = 2; - unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode(); if (IsTopRoot && ReductionBitWidth == DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) Limit = 3; unsigned MaxBitWidth = ComputeMaxBitWidth( - *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode, - Limit, IsTruncRoot, IsSignedCmp); + *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, + IsTruncRoot, IsSignedCmp); if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) ReductionBitWidth = bit_ceil(MaxBitWidth); @@ -18373,19 +18375,21 @@ void BoUpSLP::computeMinimumValueSizes() { }); IsSignedCmp = NodeIdx < VectorizableTree.size() && - any_of(VectorizableTree[NodeIdx]->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE->getOpcode() == Instruction::ICmp && - any_of(EI.UserTE->Scalars, [&](Value *V) { - auto *IC = dyn_cast(V); - return IC && - (IC->isSigned() || - !isKnownNonNegative(IC->getOperand(0), - SimplifyQuery(*DL)) || - !isKnownNonNegative(IC->getOperand(1), - SimplifyQuery(*DL))); - }); - }); + any_of( + VectorizableTree[NodeIdx]->UserTreeIndices, + [&](const EdgeInfo &EI) { + return (EI.UserTE->hasState() && + EI.UserTE->getOpcode() == Instruction::ICmp) && + any_of(EI.UserTE->Scalars, [&](Value *V) { + auto *IC = dyn_cast(V); + return IC && + (IC->isSigned() || + !isKnownNonNegative(IC->getOperand(0), + SimplifyQuery(*DL)) || + !isKnownNonNegative(IC->getOperand(1), + SimplifyQuery(*DL))); + }); + }); } // If the maximum bit width we compute is less than the width of the roots'