diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 3081379bafd06..4d311e7e9fd6a 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1771,6 +1771,10 @@ class TargetTransformInfo { /// scalable version of the vectorized loop. bool preferFixedOverScalableIfEqualCost() const; + /// \returns True if the target prefers the SLP vectorizer to use alternate + /// opcode vectorization, false otherwise. + bool preferAlternateOpcodeVectorization() const; + /// \returns True if the target prefers reductions in loop. bool preferInLoopReduction(unsigned Opcode, Type *Ty) const; @@ -2325,6 +2329,7 @@ class TargetTransformInfo::Concept { virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const = 0; + virtual bool preferAlternateOpcodeVectorization() const = 0; virtual bool preferEpilogueVectorization() const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; @@ -3135,6 +3140,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool preferInLoopReduction(unsigned Opcode, Type *Ty) const override { return Impl.preferInLoopReduction(Opcode, Ty); } + bool preferAlternateOpcodeVectorization() const override { + return Impl.preferAlternateOpcodeVectorization(); + } bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const override { return Impl.preferPredicatedReductionSelect(Opcode, Ty); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 63fe7debfb8c7..c15694916a732 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1007,6 +1007,7 @@ class TargetTransformInfoImplBase { bool preferFixedOverScalableIfEqualCost() const { return false; } bool preferInLoopReduction(unsigned Opcode, Type *Ty) const { return false; } + bool preferAlternateOpcodeVectorization() const { return true; } bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const { return false; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3d43f39439625..36f2983390a48 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1384,6 +1384,10 @@ bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, return TTIImpl->preferInLoopReduction(Opcode, Ty); } +bool TargetTransformInfo::preferAlternateOpcodeVectorization() const { + return TTIImpl->preferAlternateOpcodeVectorization(); +} + bool TargetTransformInfo::preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const { return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 020a2b8d4edfb..ac46db5faf28d 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -125,6 +125,8 @@ class RISCVTTIImpl : public BasicTTIImplBase { unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + bool preferAlternateOpcodeVectorization() const { return false; } + bool preferEpilogueVectorization() const { // Epilogue vectorization is usually unprofitable - tail folding or // a smaller VF would have been better.
This a blunt hammer - we diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 8fcaee0c7017f..c916da7f275d7 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -292,6 +292,7 @@ class X86TTIImpl : public BasicTTIImplBase { TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; + bool preferAlternateOpcodeVectorization() const { return false; } bool prefersVectorizedAddressing() const; bool supportsEfficientVectorElementLoadStore() const; bool enableInterleavedAccessVectorization(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4decf5bec9514..a9f61d7a9798a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -141,6 +141,10 @@ static cl::opt ShouldStartVectorizeHorAtStore( cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt SplitAlternateInstructions( + "slp-split-alternate-instructions", cl::init(true), cl::Hidden, + cl::desc("Improve the code quality by splitting alternate instructions")); + static cl::opt MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); @@ -840,6 +844,35 @@ class InstructionsState { return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } + /// Checks if main/alt instructions are shift operations. + bool isShiftOp() const { + return getMainOp()->isShift() && getAltOp()->isShift(); + } + + /// Checks if main/alt instructions are bitwise logic operations. + bool isBitwiseLogicOp() const { + return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp(); + } + + /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations. + bool isMulDivLikeOp() const { + constexpr std::array MulDiv = { + Instruction::Mul, Instruction::FMul, Instruction::SDiv, + Instruction::UDiv, Instruction::FDiv, Instruction::SRem, + Instruction::URem, Instruction::FRem}; + return is_contained(MulDiv, getOpcode()) && + is_contained(MulDiv, getAltOpcode()); + } + + /// Checks if main/alt instructions are add/sub/fadd/fsub operations. + bool isAddSubLikeOp() const { + constexpr std::array AddSub = { + Instruction::Add, Instruction::Sub, Instruction::FAdd, + Instruction::FSub}; + return is_contained(AddSub, getOpcode()) && + is_contained(AddSub, getAltOpcode()); + } + /// Checks if the current state is valid, i.e. has non-null MainOp bool valid() const { return MainOp && AltOp; } @@ -1472,6 +1505,7 @@ class BoUpSLP { void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntries.clear(); + ScalarsInSplitNodes.clear(); MustGather.clear(); NonScheduledFirst.clear(); EntryToLastInstruction.clear(); @@ -1507,7 +1541,7 @@ class BoUpSLP { /// should be represented as an empty order, so this is used to /// decide if we can canonicalize a computed order. Undef elements /// (represented as size) are ignored. - bool isIdentityOrder(ArrayRef Order) const { + static bool isIdentityOrder(ArrayRef Order) { assert(!Order.empty() && "expected non-empty order"); const unsigned Sz = Order.size(); return all_of(enumerate(Order), [&](const auto &P) { @@ -3229,12 +3263,35 @@ class BoUpSLP { /// \returns Common mask for reorder indices and reused scalars. 
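Annotation (not part of the patch; the getCommonMask helper documented just above resumes after this note): the new hook follows the usual three-layer TTI plumbing shown in the first hunks: a public TargetTransformInfo method, a pure-virtual Concept, and a Model template forwarding to the concrete target implementation, with TargetTransformInfoImplBase supplying the default of true. A minimal, self-contained C++ sketch of that dispatch pattern, with illustrative names:

#include <iostream>
#include <memory>

struct Concept {
  virtual ~Concept() = default;
  virtual bool preferAlternateOpcodeVectorization() const = 0;
};

template <typename ImplT> struct Model final : Concept {
  ImplT Impl;
  explicit Model(ImplT I) : Impl(I) {}
  bool preferAlternateOpcodeVectorization() const override {
    return Impl.preferAlternateOpcodeVectorization();
  }
};

// Mirrors TargetTransformInfoImplBase's default of true.
struct BaseImpl {
  bool preferAlternateOpcodeVectorization() const { return true; }
};

// Mirrors the RISCV/X86 overrides above, which opt out so that split
// (SplitVectorize) nodes are preferred over alternate-opcode nodes.
struct RISCVLikeImpl : BaseImpl {
  bool preferAlternateOpcodeVectorization() const { return false; }
};

int main() {
  std::unique_ptr<Concept> TTI =
      std::make_unique<Model<RISCVLikeImpl>>(RISCVLikeImpl{});
  std::cout << TTI->preferAlternateOpcodeVectorization() << '\n'; // prints 0
}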
SmallVector getCommonMask() const { + if (State == TreeEntry::SplitVectorize) + return {}; SmallVector Mask; inversePermutation(ReorderIndices, Mask); ::addMask(Mask, ReuseShuffleIndices); return Mask; } + /// \returns The mask for split nodes. + SmallVector getSplitMask() const { + assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() && + "Expected only split vectorize node."); + SmallVector Mask(getVectorFactor(), PoisonMaskElem); + unsigned CommonVF = std::max( + CombinedEntriesWithIndices.back().second, + Scalars.size() - CombinedEntriesWithIndices.back().second); + for (auto [Idx, I] : enumerate(ReorderIndices)) + Mask[I] = + Idx + (Idx >= CombinedEntriesWithIndices.back().second + ? CommonVF - CombinedEntriesWithIndices.back().second + : 0); + return Mask; + } + + /// Updates (reorders) SplitVectorize node according to the given mask \p + /// Mask and order \p MaskOrder. + void reorderSplitNode(unsigned Idx, ArrayRef Mask, + ArrayRef MaskOrder); + /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { auto &&IsSame = [VL](ArrayRef Scalars, ArrayRef Mask) { @@ -3322,6 +3379,8 @@ class BoUpSLP { ///< complex node like select/cmp to minmax, mul/add to ///< fma, etc. Must be used for the following nodes in ///< the pattern, not the very first one. + SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them + ///< independently and then combines back. }; EntryState State; @@ -3352,7 +3411,7 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. unsigned Idx = 0; - /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from + /// For gather/buildvector/alt opcode nodes, which are combined from /// other nodes as a series of insertvector instructions. SmallVector, 2> CombinedEntriesWithIndices; @@ -3547,6 +3606,9 @@ class BoUpSLP { case CombinedVectorize: dbgs() << "CombinedVectorize\n"; break; + case SplitVectorize: + dbgs() << "SplitVectorize\n"; + break; } if (S) { dbgs() << "MainOp: " << *S.getMainOp() << "\n"; @@ -3627,8 +3689,10 @@ class BoUpSLP { const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = {}, ArrayRef ReorderIndices = {}) { - assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || - (Bundle && EntryState != TreeEntry::NeedToGather)) && + assert(((!Bundle && (EntryState == TreeEntry::NeedToGather || + EntryState == TreeEntry::SplitVectorize)) || + (Bundle && EntryState != TreeEntry::NeedToGather && + EntryState != TreeEntry::SplitVectorize)) && "Need to vectorize gather entry?"); // Gathered loads still gathered? Do not create entry, use the original one. 
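Annotation (not part of the patch; the newTreeEntry logic the comment above belongs to resumes below): a self-contained model of the getSplitMask computation just defined. A split node keeps its first Off2 scalars in subtree 0 and the rest in subtree 1; both shuffle sources are widened to CommonVF lanes, so lanes coming from subtree 1 are biased by CommonVF - Off2. All values below are illustrative.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  constexpr int PoisonMaskElem = -1; // same sentinel LLVM uses
  unsigned VF = 6;   // number of scalars in the split node
  unsigned Off2 = 2; // scalars [0, Off2) belong to subtree 0
  std::vector<unsigned> ReorderIndices = {0, 2, 4, 1, 3, 5};
  unsigned CommonVF = std::max(Off2, VF - Off2); // width of each source
  std::vector<int> Mask(VF, PoisonMaskElem);
  for (unsigned Idx = 0; Idx < ReorderIndices.size(); ++Idx) {
    unsigned I = ReorderIndices[Idx];
    // Scalars of subtree 1 (Idx >= Off2) index into the second source.
    Mask[I] = Idx + (Idx >= Off2 ? CommonVF - Off2 : 0);
  }
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 5 1 6 4 7
  std::printf("\n");
}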
if (GatheredLoadsEntriesFirst.has_value() && @@ -3662,11 +3726,38 @@ class BoUpSLP { return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); - if (S) + if (S) { Last->setOperations(S); + } else if (EntryState == TreeEntry::SplitVectorize) { + auto *MainOp = + cast(*find_if(Last->Scalars, IsaPred)); + auto *AltOp = cast(*find_if(Last->Scalars, [=](Value *V) { + auto *I = dyn_cast(V); + return I && I->getOpcode() != MainOp->getOpcode(); + })); + Last->setOperations(InstructionsState(MainOp, AltOp)); + } + if (EntryState == TreeEntry::SplitVectorize) { + SmallPtrSet Processed; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + auto It = ScalarsInSplitNodes.find(V); + if (It == ScalarsInSplitNodes.end()) { + ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back( + Last); + (void)Processed.insert(V); + } else if (Processed.insert(V).second) { + assert(!is_contained(It->getSecond(), Last) && + "Value already associated with the node."); + It->getSecond().push_back(Last); + } + } + } Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } - if (!Last->isGather()) { + if (!Last->isGather() && Last->State != TreeEntry::SplitVectorize) { SmallPtrSet Processed; for (Value *V : VL) { if (isa(V)) @@ -3703,7 +3794,7 @@ class BoUpSLP { } } assert(!BundleMember && "Bundle and VL out of sync"); - } else { + } else if (Last->isGather()) { // Build a map for gathered scalars to the nodes where they are used. bool AllConstsOrCasts = true; for (Value *V : VL) @@ -3748,6 +3839,15 @@ class BoUpSLP { return It->getSecond(); } + /// Get list of split vector entries, associated with the value \p V. + ArrayRef getSplitTreeEntries(Value *V) const { + assert(V && "V cannot be nullptr."); + auto It = ScalarsInSplitNodes.find(V); + if (It == ScalarsInSplitNodes.end()) + return {}; + return It->getSecond(); + } + /// Returns first vector node for value \p V, matching values \p VL. TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef VL, bool SameVF = false) const { @@ -3778,6 +3878,9 @@ class BoUpSLP { /// Maps a specific scalar to its tree entry(ies). SmallDenseMap> ScalarToTreeEntries; + /// Scalars, used in split vectorize nodes. + SmallDenseMap> ScalarsInSplitNodes; + /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; @@ -5764,12 +5867,14 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) && (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) return std::nullopt; - if ((TE.State == TreeEntry::Vectorize || - TE.State == TreeEntry::StridedVectorize) && - (isa(TE.getMainOp()) || - (TopToBottom && isa(TE.getMainOp())))) { - assert(!TE.isAltShuffle() && "Alternate instructions are only supported by " - "BinaryOperator and CastInst."); + if (TE.State == TreeEntry::SplitVectorize || + ((TE.State == TreeEntry::Vectorize || + TE.State == TreeEntry::StridedVectorize) && + (isa(TE.getMainOp()) || + (TopToBottom && isa(TE.getMainOp()))))) { + assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) && + "Alternate instructions are only supported by " + "BinaryOperator and CastInst."); return TE.ReorderIndices; } if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { @@ -5880,7 +5985,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, return std::nullopt; // No need to reorder. 
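Annotation (not part of the patch): the newTreeEntry changes above register every instruction of a SplitVectorize node in ScalarsInSplitNodes, while deduplicating repeated scalars within one bundle. A heavily simplified, self-contained sketch of that bookkeeping (LLVM types reduced to plain pointers; the real code also asserts a node is not registered twice for the same value):

#include <unordered_map>
#include <unordered_set>
#include <vector>

struct TreeEntry {};
using Value = int; // stand-in for llvm::Value

// Maps a scalar to every split node that contains it.
std::unordered_map<Value *, std::vector<TreeEntry *>> ScalarsInSplitNodes;

void registerSplitNode(const std::vector<Value *> &VL, TreeEntry *Last) {
  std::unordered_set<Value *> Processed;
  for (Value *V : VL) {
    if (!V)
      continue; // the patch skips non-instruction scalars here
    // Register each distinct scalar of this bundle exactly once.
    if (Processed.insert(V).second)
      ScalarsInSplitNodes[V].push_back(Last);
  }
}

int main() {
  TreeEntry Split;
  Value A = 1, B = 2;
  registerSplitNode({&A, &B, &A}, &Split); // A is registered only once
  return ScalarsInSplitNodes[&A].size() == 1 ? 0 : 1;
}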
return std::move(Phis); } - if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && + if (TE.isGather() && + (!TE.hasState() || !TE.isAltShuffle() || + ScalarsInSplitNodes.contains(TE.getMainOp())) && allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. @@ -6088,6 +6195,30 @@ bool BoUpSLP::isProfitableToReorder() const { return true; } +void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef Mask, + ArrayRef MaskOrder) { + assert(State == TreeEntry::SplitVectorize && "Expected split user node."); + SmallVector NewMask(getVectorFactor()); + SmallVector NewMaskOrder(getVectorFactor()); + std::iota(NewMask.begin(), NewMask.end(), 0); + std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0); + if (Idx == 0) { + copy(Mask, NewMask.begin()); + copy(MaskOrder, NewMaskOrder.begin()); + } else { + assert(Idx == 1 && "Expected either 0 or 1 index."); + unsigned Offset = CombinedEntriesWithIndices.back().second; + for (unsigned I : seq(Mask.size())) { + NewMask[I + Offset] = Mask[I] + Offset; + NewMaskOrder[I + Offset] = MaskOrder[I] + Offset; + } + } + reorderScalars(Scalars, NewMask); + reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true); + if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -6122,7 +6253,8 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->hasState() && TE->isAltShuffle()) { + if (TE->hasState() && TE->isAltShuffle() && + TE->State != TreeEntry::SplitVectorize) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -6163,7 +6295,8 @@ void BoUpSLP::reorderTopToBottom() { } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) || + TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && @@ -6194,7 +6327,8 @@ void BoUpSLP::reorderTopToBottom() { for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, // just need to merge reordering shuffle and the reuse shuffle. - if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) && + OpTE->State != TreeEntry::SplitVectorize) continue; // Count number of orders uses. const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, @@ -6301,14 +6435,17 @@ void BoUpSLP::reorderTopToBottom() { // Just do the reordering for the nodes with the given VF. if (TE->Scalars.size() != VF) { if (TE->ReuseShuffleIndices.size() == VF) { + assert(TE->State != TreeEntry::SplitVectorize && + "Split vectorized not expected."); // Need to reorder the reuses masks of the operands with smaller VF to // be able to find the match between the graph nodes and scalar // operands of the given node during vectorization/cost estimation. 
- assert((!TE->UserTreeIndex || - TE->UserTreeIndex.UserTE->Scalars.size() == VF || - TE->UserTreeIndex.UserTE->Scalars.size() == - TE->Scalars.size()) && - "All users must be of VF size."); + assert( + (!TE->UserTreeIndex || + TE->UserTreeIndex.UserTE->Scalars.size() == VF || + TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() || + TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) && + "All users must be of VF size."); if (SLPReVec) { assert(SLPReVec && "Only supported by REVEC."); // ShuffleVectorInst does not do reorderOperands (and it should not @@ -6325,19 +6462,28 @@ void BoUpSLP::reorderTopToBottom() { // Update ordering of the operands with the smaller VF than the given // one. reorderNodeWithReuses(*TE, Mask); + // Update orders in user split vectorize nodes. + if (TE->UserTreeIndex && + TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) + TE->UserTreeIndex.UserTE->reorderSplitNode( + TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder); } continue; } - if ((TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) && - (isa(TE->getMainOp()) || - (SLPReVec && isa(TE->getMainOp())))) { - assert(!TE->isAltShuffle() && - "Alternate instructions are only supported by BinaryOperator " - "and CastInst."); - // Build correct orders for extract{element,value}, loads and - // stores. + if ((TE->State == TreeEntry::SplitVectorize && + TE->ReuseShuffleIndices.empty()) || + ((TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::StridedVectorize) && + (isa(TE->getMainOp()) || + (SLPReVec && isa(TE->getMainOp()))))) { + assert( + (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize && + TE->ReuseShuffleIndices.empty())) && + "Alternate instructions are only supported by BinaryOperator " + "and CastInst."); + // Build correct orders for extract{element,value}, loads, + // stores and alternate (split) nodes. reorderOrder(TE->ReorderIndices, Mask); if (isa(TE->getMainOp())) TE->reorderOperands(Mask); @@ -6358,6 +6504,11 @@ void BoUpSLP::reorderTopToBottom() { addMask(NewReuses, TE->ReuseShuffleIndices); TE->ReuseShuffleIndices.swap(NewReuses); } + // Update orders in user split vectorize nodes. + if (TE->UserTreeIndex && + TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) + TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx, + Mask, MaskOrder); } } } @@ -6370,7 +6521,8 @@ bool BoUpSLP::canReorderOperands( if (any_of(Edges, [I](const std::pair &OpData) { return OpData.first == I && (OpData.second->State == TreeEntry::Vectorize || - OpData.second->State == TreeEntry::StridedVectorize); + OpData.second->State == TreeEntry::StridedVectorize || + OpData.second->State == TreeEntry::SplitVectorize); })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { @@ -6384,6 +6536,7 @@ bool BoUpSLP::canReorderOperands( // node, just reorder reuses mask. 
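Annotation (not part of the patch; the canReorderOperands hunk continues below): a worked, self-contained example of the mask widening reorderSplitNode performs above. The operand's local mask is overlaid on an identity mask of the split node's full width, shifted by the second operand's offset when Idx == 1. Values are illustrative.

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  unsigned VF = 8, Offset = 4;            // split node width, operand 1 start
  std::vector<int> OpMask = {1, 0, 3, 2}; // reordering of the second operand
  std::vector<int> NewMask(VF);
  std::iota(NewMask.begin(), NewMask.end(), 0); // identity for operand 0 part
  for (unsigned I = 0; I < OpMask.size(); ++I)
    NewMask[I + Offset] = OpMask[I] + Offset;   // overlay at the offset
  for (int M : NewMask)
    std::printf("%d ", M); // prints: 0 1 2 3 5 4 7 6
  std::printf("\n");
}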
if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); continue; @@ -6393,6 +6546,7 @@ bool BoUpSLP::canReorderOperands( [&Gather, UserTE, I](TreeEntry *TE) { assert(TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && "Only non-vectorized nodes are expected."); if (TE->UserTreeIndex.UserTE == UserTE && TE->UserTreeIndex.EdgeIdx == I) { @@ -6412,7 +6566,14 @@ bool BoUpSLP::canReorderOperands( } void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { - SetVector OrderedEntries; + struct TreeEntryCompare { + bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const { + if (LHS->UserTreeIndex && RHS->UserTreeIndex) + return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx; + return LHS->Idx < RHS->Idx; + } + }; + PriorityQueue, TreeEntryCompare> Queue; DenseSet GathersToOrders; // Find all reorderable leaf nodes with the given VF. // Currently the are vectorized loads,extracts without alternate operands + @@ -6420,13 +6581,15 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector NonVectorized; for (const std::unique_ptr &TE : VectorizableTree) { if (TE->State != TreeEntry::Vectorize && - TE->State != TreeEntry::StridedVectorize) + TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) { - OrderedEntries.insert(TE.get()); + Queue.push(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) || + TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.insert(TE.get()); } @@ -6437,40 +6600,88 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // one operand order in the natural order and reorder others + reorder the // user node itself. SmallPtrSet Visited, RevisitedOps; - while (!OrderedEntries.empty()) { + while (!Queue.empty()) { // 1. Filter out only reordered nodes. - DenseMap>> Users; - SmallVector Filtered; - for (TreeEntry *TE : OrderedEntries) { + std::pair>> Users; + TreeEntry *TE = Queue.top(); + const TreeEntry *UserTE = TE->UserTreeIndex.UserTE; + Queue.pop(); + SmallVector OrderedOps(1, TE); + while (!Queue.empty()) { + TE = Queue.top(); + if (!UserTE || UserTE != TE->UserTreeIndex.UserTE) + break; + Queue.pop(); + OrderedOps.push_back(TE); + } + for (TreeEntry *TE : OrderedOps) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize || (TE->isGather() && GathersToOrders.contains(TE))) || !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || - !Visited.insert(TE).second) { - Filtered.push_back(TE); + !Visited.insert(TE).second) continue; - } // Build a map between user nodes and their operands order to speedup // search. The graph currently does not provide this dependency directly. - Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx, - TE); - } - // Erase filtered entries. 
- for (TreeEntry *TE : Filtered) - OrderedEntries.remove(TE); - SmallVector< - std::pair>>> - UsersVec(Users.begin(), Users.end()); - sort(UsersVec, [](const auto &Data1, const auto &Data2) { - return Data1.first->Idx > Data2.first->Idx; - }); - for (auto &Data : UsersVec) { + Users.first = TE->UserTreeIndex.UserTE; + Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE); + } + if (Users.first) { + auto &Data = Users; + if (Data.first->State == TreeEntry::SplitVectorize) { + assert( + Data.second.size() <= 2 && + "Expected not greater than 2 operands for split vectorize node."); + if (any_of(Data.second, + [](const auto &Op) { return !Op.second->UserTreeIndex; })) + continue; + // Update orders in user split vectorize nodes. + assert(Data.first->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 entries."); + for (const auto &P : Data.first->CombinedEntriesWithIndices) { + TreeEntry &OpTE = *VectorizableTree[P.first].get(); + OrdersType Order = OpTE.ReorderIndices; + if (Order.empty()) { + if (!OpTE.isGather()) + continue; + const auto BestOrder = + getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder); + if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder)) + continue; + Order = *BestOrder; + } + fixupOrderingIndices(Order); + SmallVector Mask; + inversePermutation(Order, Mask); + const unsigned E = Order.size(); + SmallVector MaskOrder(E, PoisonMaskElem); + transform(Order, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast(I) : PoisonMaskElem; + }); + Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder); + // Clear ordering of the operand. + if (!OpTE.ReorderIndices.empty()) { + OpTE.ReorderIndices.clear(); + } else { + assert(OpTE.isGather() && "Expected only gather/buildvector node."); + reorderScalars(OpTE.Scalars, Mask); + } + } + if (Data.first->ReuseShuffleIndices.empty() && + !Data.first->ReorderIndices.empty()) { + // Insert user node to the list to try to sink reordering deeper in + // the graph. + Queue.push(Data.first); + } + continue; + } // Check that operands are used only in the User node. SmallVector GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, GatherOps)) { for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); + Visited.insert(Op.second); continue; } // All operands are reordered and used only in this node - propagate the @@ -6563,6 +6774,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { UTE->UserTreeIndex.UserTE == Data.first) || (Data.first->UserTreeIndex && Data.first->UserTreeIndex.UserTE == UTE) || + (IgnoreReorder && UTE->UserTreeIndex && + UTE->UserTreeIndex.UserTE->Idx == 0) || NodeShouldBeReorderedWithOperands(UTE); })) continue; @@ -6576,7 +6789,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; const TreeEntry *Op = getOperandEntry(UTE, Idx); Visited.erase(Op); - OrderedEntries.insert(const_cast(Op)); + Queue.push(const_cast(Op)); } } } @@ -6633,7 +6846,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // the compile time. // Profitable to reorder if definitely more operands allow // reordering rather than those with natural order. 
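Annotation (not part of the patch; the reorderBottomToTop hunk continues below): the old SetVector-plus-sort scheme is replaced above by a PriorityQueue keyed on the user entry's index, so all operands of one user node pop out adjacently, deeper users first (std::priority_queue with a less-than comparator pops the largest element). A self-contained model:

#include <cstdio>
#include <queue>
#include <vector>

struct TreeEntry {
  unsigned Idx;
  const TreeEntry *UserTE; // nullptr for the root
};

struct TreeEntryCompare {
  bool operator()(const TreeEntry *L, const TreeEntry *R) const {
    if (L->UserTE && R->UserTE)
      return L->UserTE->Idx < R->UserTE->Idx;
    return L->Idx < R->Idx;
  }
};

int main() {
  TreeEntry Root{0, nullptr}, A{1, &Root}, B{2, &A}, C{3, &A};
  std::priority_queue<TreeEntry *, std::vector<TreeEntry *>, TreeEntryCompare> Q;
  for (TreeEntry *T : {&B, &C, &A})
    Q.push(T);
  while (!Q.empty()) {
    // B and C (operands of the same user A) come out adjacent, before A.
    std::printf("entry %u (user %u)\n", Q.top()->Idx,
                Q.top()->UserTE ? Q.top()->UserTE->Idx : 0u);
    Q.pop();
  }
}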
- ArrayRef> Ops = Users[UserTE]; + ArrayRef> Ops = Users.second; if (static_cast(count_if( Ops, [UserTE, &AllowsReordering]( const std::pair &Op) { @@ -6645,7 +6858,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } if (OrdersUses.empty()) { for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); + Visited.insert(Op.second); continue; } // Choose the most used order. @@ -6675,7 +6888,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Set order of the user node. if (isIdentityOrder(BestOrder)) { for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); + Visited.insert(Op.second); continue; } fixupOrderingIndices(BestOrder); @@ -6690,7 +6903,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { }); for (const std::pair &Op : Data.second) { TreeEntry *TE = Op.second; - OrderedEntries.remove(TE); if (!VisitedOps.insert(TE).second) continue; if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { @@ -6700,6 +6912,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Gathers are processed separately. if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && (TE->State != TreeEntry::ScatterVectorize || TE->ReorderIndices.empty())) continue; @@ -6720,7 +6933,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; } reorderScalars(Gather->Scalars, Mask); - OrderedEntries.remove(Gather); + Visited.insert(Gather); } // Reorder operands of the user node and set the ordering for the user // node itself. @@ -6740,7 +6953,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { !Data.first->isAltShuffle()) { // Insert user node to the list to try to sink reordering deeper in // the graph. - OrderedEntries.insert(Data.first); + Queue.push(Data.first); } } else { reorderOrder(Data.first->ReorderIndices, Mask); @@ -6770,7 +6983,7 @@ void BoUpSLP::buildExternalUses( TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->isGather()) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) continue; // For each lane: @@ -8379,6 +8592,48 @@ class PHIHandler { }; } // namespace +/// Returns main/alternate instructions for the given \p VL. Unlike +/// getSameOpcode, this supports non-compatible instructions for better +/// SplitVectorize node support. +/// \returns the first main/alt instructions, if the list contains only poisons +/// and instructions with only 2 distinct opcodes. Returns a pair of nullptr +/// values otherwise. +static std::pair +getMainAltOpsNoStateVL(ArrayRef VL) { + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + for (Value *V : VL) { + if (isa(V)) + continue; + auto *I = dyn_cast(V); + if (!I) + return {}; + if (!MainOp) { + MainOp = I; + continue; + } + if (MainOp->getOpcode() == I->getOpcode()) { + if (I->getParent() != MainOp->getParent()) + return {}; + continue; + } + if (!AltOp) { + AltOp = I; + continue; + } + if (AltOp->getOpcode() == I->getOpcode()) { + if (I->getParent() != AltOp->getParent()) + return {}; + continue; + } + return {}; + } + if (!AltOp) + return {}; + assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() && + "Expected different main and alt instructions."); + return std::make_pair(MainOp, AltOp); +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx, unsigned InterleaveFactor) { @@ -8529,6 +8784,146 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // Tries to build split node.
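Annotation (not part of the patch; the TrySplitNode lambda that the comment above introduces follows below): a self-contained model of the two-opcode scan in getMainAltOpsNoStateVL. It accepts poison, tracks at most two distinct opcodes, and gives up on a third one; the parent-block checks of the real code are elided, and the opcode values are only illustrative.

#include <cstdio>
#include <optional>
#include <utility>
#include <vector>

using Op = int; // stands in for an Instruction opcode; < 0 models poison

std::pair<std::optional<Op>, std::optional<Op>>
mainAltOps(const std::vector<Op> &VL) {
  std::optional<Op> Main, Alt;
  for (Op O : VL) {
    if (O < 0)
      continue; // poison is compatible with anything
    if (!Main || *Main == O) {
      Main = O;
      continue;
    }
    if (!Alt || *Alt == O) {
      Alt = O;
      continue;
    }
    return {}; // a third opcode: no split candidate
  }
  if (!Alt)
    return {}; // need exactly two distinct opcodes
  return {Main, Alt};
}

int main() {
  auto [M, A] = mainAltOps({13, -1, 15, 13, 15}); // e.g. Add = 13, Sub = 15
  if (M && A)
    std::printf("main=%d alt=%d\n", *M, *A);
}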
+ constexpr unsigned SmallNodeSize = 4; + auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize, + const InstructionsState &LocalState) { + if (VL.size() <= SmallNodeSize || + TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions) + return false; + + // If any value is already used in a split node - just gather. + if (any_of(VL, [&](Value *V) { + return ScalarsInSplitNodes.contains(V) || isVectorized(V); + })) { + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return true; + } + SmallVector Op1, Op2; + OrdersType ReorderIndices(VL.size(), VL.size()); + SmallBitVector Op1Indices(VL.size()); + for (auto [Idx, V] : enumerate(VL)) { + auto *I = dyn_cast(V); + if (!I) { + Op1.push_back(V); + Op1Indices.set(Idx); + continue; + } + InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI); + if (NewS && !NewS.isAltShuffle()) { + Op1.push_back(V); + Op1Indices.set(Idx); + continue; + } + Op2.push_back(V); + } + Type *ScalarTy = getValueType(VL.front()); + VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); + unsigned Opcode0 = LocalState.getOpcode(); + unsigned Opcode1 = LocalState.getAltOpcode(); + SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1)); + // Enable split node, only if both operands are power-of-2/full registers + // and the nodes do not form a legal alternate instruction (like X86 + // addsub). + SmallPtrSet UOp1(Op1.begin(), Op1.end()); + SmallPtrSet UOp2(Op2.begin(), Op2.end()); + if (UOp1.size() <= 1 || UOp2.size() <= 1 || + TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) || + !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) || + !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size())) + return false; + unsigned Op1Cnt = 0, Op2Cnt = Op1.size(); + for (unsigned Idx : seq(VL.size())) { + if (Op1Indices.test(Idx)) { + ReorderIndices[Op1Cnt] = Idx; + ++Op1Cnt; + } else { + ReorderIndices[Op2Cnt] = Idx; + ++Op2Cnt; + } + } + if (isIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); + SmallVector Mask; + if (!ReorderIndices.empty()) + inversePermutation(ReorderIndices, Mask); + unsigned NumParts = TTI.getNumberOfParts(VecTy); + VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size()); + VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size()); + // Check for non-profitable single-register ops, which are better + // represented as alternate ops.
+ if (NumParts >= VL.size()) + return false; + if ((LocalState.getMainOp()->isBinaryOp() && + LocalState.getAltOp()->isBinaryOp() && + (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() || + LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) || + (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) || + (LocalState.getMainOp()->isUnaryOp() && + LocalState.getAltOp()->isUnaryOp())) { + constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; + InstructionCost InsertCost = ::getShuffleCost( + TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy); + FixedVectorType *SubVecTy = + getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size())); + InstructionCost NewShuffleCost = + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind); + if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost)) + return false; + InstructionCost OriginalVecOpsCost = + TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) + + TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind); + SmallVector OriginalMask(VL.size(), PoisonMaskElem); + for (unsigned Idx : seq(VL.size())) { + if (isa(VL[Idx])) + continue; + OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size()); + } + InstructionCost OriginalCost = + OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, + VecTy, OriginalMask, Kind); + InstructionCost NewVecOpsCost = + TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) + + TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind); + InstructionCost NewCost = + NewVecOpsCost + InsertCost + + (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::Store + ? NewShuffleCost + : 0); + // If not profitable to split - exit. + if (NewCost >= OriginalCost) + return false; + } + + SmallVector NewVL(VL.size()); + copy(Op1, NewVL.begin()); + copy(Op2, std::next(NewVL.begin(), Op1.size())); + auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt, + LocalState, UserTreeIdx, {}, ReorderIndices); + LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump()); + auto AddNode = [&](ArrayRef Op, unsigned Idx) { + InstructionsState S = getSameOpcode(Op, *TLI); + if (S && (isa(S.getMainOp()) || + getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) { + // Build gather node for loads, they will be gathered later. + TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(), + Idx == 0 ? 0 : Op1.size()); + (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt, S, + {TE, Idx}); + } else { + TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(), + Idx == 0 ? 0 : Op1.size()); + buildTree_rec(Op, Depth, {TE, Idx}); + } + }; + AddNode(Op1, 0); + AddNode(Op2, 1); + return true; + }; + // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. @@ -8614,6 +9009,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { + if (!S) { + auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL); + // Last chance to try to vectorize alternate node. + if (MainOp && AltOp && + TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp))) + return; + } LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. 
\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); @@ -8693,6 +9095,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // FIXME: investigate if there are profitable cases for VL.size() <= 4. + if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S)) + return; + // Check that every instruction appears once in this bundle. if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; @@ -8725,6 +9131,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); + // Last chance to try to vectorize alternate node. + if (S.isAltShuffle() && ReuseShuffleIndices.empty() && + TrySplitNode(SmallNodeSize, S)) + return; newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); NonScheduledFirst.insert(VL.front()); @@ -8869,6 +9279,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); break; case TreeEntry::CombinedVectorize: + case TreeEntry::SplitVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected loads state."); } @@ -10046,6 +10457,69 @@ void BoUpSLP::transformNodes() { reorderGatherNode(E); } + // Better to use full gathered loads analysis, if there are only 2 gathered + // load nodes, each having fewer than 16 elements. + constexpr unsigned VFLimit = 16; + bool ForceLoadGather = + count_if(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->isGather() && TE->hasState() && + TE->getOpcode() == Instruction::Load && + TE->getVectorFactor() < VFLimit; + }) == 2; + + // Checks if the scalars are used in another node. + auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef VL, + function_ref CheckContainer) { + return TE->isSame(VL) || all_of(VL, [&](Value *V) { + if (isa(V)) + return true; + auto *I = dyn_cast(V); + if (!I) + return false; + return is_contained(TE->Scalars, I) || CheckContainer(I); + }); + }; + auto CheckForSameVectorNodes = [&](const TreeEntry &E) { + if (E.hasState()) { + if (ArrayRef TEs = getTreeEntries(E.getMainOp()); + !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) { + return AreReusedScalars(TE, E.Scalars, [&](Value *V) { + ArrayRef VTEs = getTreeEntries(V); + return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) { + return is_contained(TEs, TE); + }); + }); + })) + return true; + if (ArrayRef TEs = getSplitTreeEntries(E.getMainOp()); + !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) { + return AreReusedScalars(TE, E.Scalars, [&](Value *V) { + ArrayRef VTEs = getSplitTreeEntries(V); + return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) { + return is_contained(TEs, TE); + }); + }); + })) + return true; + } else { + // Check if the gather node is a full copy of a split node. + auto *It = find_if(E.Scalars, IsaPred); + if (It != E.Scalars.end()) { + if (ArrayRef TEs = getSplitTreeEntries(*It); + !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) { + return AreReusedScalars(TE, E.Scalars, [&](Value *V) { + ArrayRef VTEs = getSplitTreeEntries(V); + return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) { + return is_contained(TEs, TE); + }); + }); + })) + return true; + } + } + return false; + }; // The tree may grow here, so iterate over nodes, built before.
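Annotation (not part of the patch; the transformNodes loop the comment above introduces continues below): the ForceLoadGather heuristic in a self-contained form: per-node transformation of gathered loads is skipped when the tree has exactly two gathered load nodes, each narrower than 16 lanes, since the whole-tree gathered-loads analysis handles that shape better. Data shapes are simplified stand-ins.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Node {
  bool IsGatherLoad; // models isGather() && getOpcode() == Load
  unsigned VF;
};

bool forceLoadGather(const std::vector<Node> &Tree) {
  constexpr unsigned VFLimit = 16;
  return std::count_if(Tree.begin(), Tree.end(), [](const Node &N) {
           return N.IsGatherLoad && N.VF < VFLimit;
         }) == 2;
}

int main() {
  // Two small gathered load nodes -> prefer the full gathered-loads analysis.
  std::printf("%d\n", forceLoadGather({{true, 4}, {true, 8}, {false, 4}}));
}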
for (unsigned Idx : seq(BaseGraphSize)) { TreeEntry &E = *VectorizableTree[Idx]; @@ -10060,6 +10534,11 @@ void BoUpSLP::transformNodes() { E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; + if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load) + continue; + // Check if the node is a copy of other vector nodes. + if (CheckForSameVectorNodes(E)) + continue; // Try to find vectorizable sequences and transform them into a series of // insertvector instructions. unsigned StartIdx = 0; @@ -11293,7 +11772,8 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, } const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), [&](const std::unique_ptr &TE) { - return TE->isGather() && + return (TE->isGather() || + TE->State == TreeEntry::SplitVectorize) && TE->UserTreeIndex.EdgeIdx == Idx && TE->UserTreeIndex.UserTE == E; }); @@ -11351,6 +11831,32 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return processBuildVector( E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); } + if (E->State == TreeEntry::SplitVectorize) { + assert(E->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 combined entries."); + assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask."); + InstructionCost VectorCost = 0; + if (E->ReorderIndices.empty()) { + VectorCost = ::getShuffleCost( + *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind, + E->CombinedEntriesWithIndices.back().second, + getWidenedType( + ScalarTy, + VectorizableTree[E->CombinedEntriesWithIndices.back().first] + ->getVectorFactor())); + } else { + unsigned CommonVF = + std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first] + ->getVectorFactor(), + VectorizableTree[E->CombinedEntriesWithIndices.back().first] + ->getVectorFactor()); + VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + getWidenedType(ScalarTy, CommonVF), + E->getSplitMask(), CostKind); + } + LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree")); + return VectorCost; + } InstructionCost CommonCost = 0; SmallVector Mask; if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize || @@ -11432,7 +11938,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, EI.EdgeIdx != 0) { auto UserBWIt = MinBWs.find(EI.UserTE); Type *UserScalarTy = - EI.UserTE->isGather() + (EI.UserTE->isGather() || + EI.UserTE->State == TreeEntry::SplitVectorize) ? 
EI.UserTE->Scalars.front()->getType() : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); if (UserBWIt != MinBWs.end()) @@ -11935,6 +12442,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, break; } case TreeEntry::CombinedVectorize: + case TreeEntry::SplitVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected vectorization state."); } @@ -12431,6 +12939,8 @@ bool BoUpSLP::isTreeNotExtendable() const { bool Res = false; for (unsigned Idx : seq(getTreeSize())) { TreeEntry &E = *VectorizableTree[Idx]; + if (E.State == TreeEntry::SplitVectorize) + return false; if (!E.isGather()) continue; if ((E.hasState() && E.getOpcode() != Instruction::Load) || @@ -12856,7 +13366,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather() && TE.hasState()) { + if (TE.hasState() && + (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) { if (const TreeEntry *E = getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars); E && E->getVectorFactor() == TE.getVectorFactor()) { @@ -13620,6 +14131,19 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( break; VToTEs.insert(TEPtr); } + if (ArrayRef VTEs = getSplitTreeEntries(V); !VTEs.empty()) { + const TreeEntry *VTE = VTEs.front(); + if (none_of(TE->CombinedEntriesWithIndices, + [&](const auto &P) { return P.first == VTE->Idx; })) { + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + } + // The node is reused - exit. + if (CheckAndUseSameNode(VTE)) + break; + VToTEs.insert(VTE); + } if (ArrayRef VTEs = getTreeEntries(V); !VTEs.empty()) { const TreeEntry *VTE = VTEs.front(); if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) && @@ -14173,6 +14697,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { assert(((GatheredLoadsEntriesFirst.has_value() && E->getOpcode() == Instruction::Load && E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) || + E->State == TreeEntry::SplitVectorize || all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && @@ -14198,6 +14723,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { } assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || + E->State == TreeEntry::SplitVectorize || (isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I)) || (GatheredLoadsEntriesFirst.has_value() && @@ -14259,8 +14785,14 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return FirstInst; }; + if (E->State == TreeEntry::SplitVectorize) { + Res = FindLastInst(); + return *Res; + } + // Set insertpoint for gathered loads to the very first load. 
- if (GatheredLoadsEntriesFirst.has_value() && + if (E->State != TreeEntry::SplitVectorize && + GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load) { Res = FindFirstInst(); @@ -14339,7 +14871,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { bool IsPHI = isa(LastInst); if (IsPHI) LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); - if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) { + if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars)) || + (GatheredLoadsEntriesFirst.has_value() && + E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && + E->getOpcode() == Instruction::Load)) { Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); } else { // Set the insertion point after the last instruction in the bundle. Set the @@ -15145,7 +15680,9 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { // correctness of the transformations in many cases. auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), [E, NodeIdx](const std::unique_ptr &TE) { - return TE->isOperandGatherNode({E, NodeIdx}); + return TE->isOperandGatherNode({E, NodeIdx}) || + (TE->State == TreeEntry::SplitVectorize && + TE->UserTreeIndex == EdgeInfo(E, NodeIdx)); }); assert(I != VectorizableTree.end() && "Gather node is not in the graph."); assert(I->get()->UserTreeIndex && @@ -15683,6 +16220,83 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = Vec; return Vec; } + if (E->State == TreeEntry::SplitVectorize) { + assert(E->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 combined entries."); + setInsertPointAfterBundle(E); + TreeEntry &OpTE1 = + *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get(); + assert(OpTE1.isSame( + ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) && + "Expected same first part of scalars."); + Value *Op1 = vectorizeTree(&OpTE1); + TreeEntry &OpTE2 = + *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get(); + assert( + OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) && + "Expected same second part of scalars."); + Value *Op2 = vectorizeTree(&OpTE2); + auto GetOperandSignedness = [&](const TreeEntry *OpE) { + bool IsSigned = false; + auto It = MinBWs.find(OpE); + if (It != MinBWs.end()) + IsSigned = It->second.second; + else + IsSigned = any_of(OpE->Scalars, [&](Value *R) { + if (isa(R)) + return false; + return !isKnownNonNegative(R, SimplifyQuery(*DL)); + }); + return IsSigned; + }; + if (cast(Op1->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + Op1 = Builder.CreateIntCast( + Op1, + getWidenedType( + ScalarTy, + cast(Op1->getType())->getNumElements()), + GetOperandSignedness(&OpTE1)); + } + if (cast(Op2->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + Op2 = Builder.CreateIntCast( + Op2, + getWidenedType( + ScalarTy, + cast(Op2->getType())->getNumElements()), + GetOperandSignedness(&OpTE2)); + } + if (E->ReorderIndices.empty()) { + SmallVector Mask(E->getVectorFactor(), PoisonMaskElem); + std::iota( + Mask.begin(), + std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second), + 0); + Value *Vec = Builder.CreateShuffleVector(Op1, Mask); + Vec = createInsertVector(Builder, Vec, Op2, + E->CombinedEntriesWithIndices.back().second); + E->VectorizedValue = Vec; + return Vec; + } + unsigned CommonVF =
std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor()); + if (getNumElements(Op1->getType()) != CommonVF) { + SmallVector Mask(CommonVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()), + 0); + Op1 = Builder.CreateShuffleVector(Op1, Mask); + } + if (getNumElements(Op2->getType()) != CommonVF) { + SmallVector Mask(CommonVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()), + 0); + Op2 = Builder.CreateShuffleVector(Op2, Mask); + } + Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask()); + E->VectorizedValue = Vec; + return Vec; + } bool IsReverseOrder = !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); @@ -17138,7 +17752,7 @@ Value *BoUpSLP::vectorizeTree( TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->isGather()) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -17191,6 +17805,9 @@ Value *BoUpSLP::vectorizeTree( VectorizableTree.front().get()) || (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() && IE->UserTreeIndex.EdgeIdx == UINT_MAX))) && + !(VectorizableTree.front()->State == TreeEntry::SplitVectorize && + IE->UserTreeIndex && + is_contained(VectorizableTree.front()->Scalars, I)) && !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && @@ -18228,6 +18845,13 @@ bool BoUpSLP::collectValuesToDemote( ToDemote.push_back(E.Idx); return IsProfitableToDemote; }; + + if (E.State == TreeEntry::SplitVectorize) + return TryProcessInstruction( + BitWidth, + {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), + VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + switch (E.getOpcode()) { // We can always demote truncations and extensions. 
Since truncations can diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index c431b058f0d2d..92027d0043f76 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -17,15 +17,12 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] ; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 18acae5835724..0f56862446a9d 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s +; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v | FileCheck %s ; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15 define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) { @@ -17,78 +17,122 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr null, align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 -; 
CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8) -; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12) -; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4) -; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP92]], i64 8) -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP132]], i64 12) -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP3]], i32 5 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP52]], i32 9 -; CHECK-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0) -; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4) -; CHECK-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP98]], i64 8) -; CHECK-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP138]], i64 12) -; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> 
-; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16) -; CHECK-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]] -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP70]] -; CHECK-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP70]] -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]] -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15) -; CHECK-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537) -; CHECK-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535) -; CHECK-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]] -; CHECK-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]] -; CHECK-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) -; CHECK-NEXT: ret i32 [[ADD113_3]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> 
[[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16) +; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]] +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16) +; CHECK-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]] +; CHECK-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]] +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> 
[[TMP67]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP115]], i32 0 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2) +; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) +; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]] +; CHECK-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4) +; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] +; CHECK-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4) +; CHECK-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]] +; CHECK-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]] +; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP98]], <16 x i32> [[TMP99]], <16 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> +; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = lshr <16 x i32> 
[[TMP108]], splat (i32 15) +; CHECK-NEXT: [[TMP110:%.*]] = and <16 x i32> [[TMP109]], splat (i32 65537) +; CHECK-NEXT: [[TMP111:%.*]] = mul <16 x i32> [[TMP110]], splat (i32 65535) +; CHECK-NEXT: [[TMP112:%.*]] = add <16 x i32> [[TMP111]], [[TMP97]] +; CHECK-NEXT: [[TMP113:%.*]] = xor <16 x i32> [[TMP112]], [[TMP108]] +; CHECK-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP113]]) +; CHECK-NEXT: ret i32 [[TMP114]] ; ; THR15-LABEL: define i32 @test( ; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { @@ -104,78 +148,122 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 -; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; THR15-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 -; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; THR15-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; THR15-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; THR15-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0) -; THR15-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4) -; THR15-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8) -; THR15-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12) -; THR15-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; THR15-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0) -; THR15-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4) -; THR15-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP143]], i64 8) -; THR15-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP148]], i64 12) -; THR15-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; THR15-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]] -; THR15-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> -; THR15-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr 
align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; THR15-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> -; THR15-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> -; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <16 x i32> -; THR15-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <16 x i32> -; THR15-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> -; THR15-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP1]], i32 5 -; THR15-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP48]], i32 9 -; THR15-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32> -; THR15-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; THR15-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0) -; THR15-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4) -; THR15-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP147]], i64 8) -; THR15-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP153]], i64 12) -; THR15-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> -; THR15-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; THR15-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]] -; THR15-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16) -; THR15-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]] -; THR15-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> -; THR15-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP70]] -; THR15-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP70]] -; THR15-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> -; THR15-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; THR15-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP52]] -; THR15-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP52]] -; THR15-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> -; THR15-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> -; THR15-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]] -; THR15-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]] -; THR15-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> -; THR15-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; THR15-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]] -; THR15-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]] -; THR15-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; THR15-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> -; THR15-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15) -; THR15-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537) -; THR15-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535) -; THR15-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]] -; THR15-NEXT: 
[[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]] -; THR15-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) -; THR15-NEXT: ret i32 [[ADD113_3]] +; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; THR15-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> +; THR15-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; THR15-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; THR15-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; THR15-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]] +; THR15-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +; THR15-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] +; THR15-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]] +; THR15-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]] +; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> +; THR15-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]] +; THR15-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]] +; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; THR15-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 +; THR15-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; THR15-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32> +; THR15-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]] +; THR15-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; THR15-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]] +; THR15-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16) +; THR15-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]] +; THR15-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]] +; THR15-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]] +; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> +; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]] +; THR15-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]] +; THR15-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> +; THR15-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; THR15-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; THR15-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; THR15-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]] +; THR15-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; THR15-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32> +; 
THR15-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; THR15-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> +; THR15-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]] +; THR15-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16) +; THR15-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]] +; THR15-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]] +; THR15-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]] +; THR15-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> +; THR15-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] +; THR15-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] +; THR15-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> +; THR15-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; THR15-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32> +; THR15-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] +; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 +; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; THR15-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2) +; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; THR15-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; THR15-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> +; THR15-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] +; THR15-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) +; THR15-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]] +; THR15-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]] +; THR15-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]] +; THR15-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> +; THR15-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]] +; THR15-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]] +; THR15-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> +; THR15-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]] +; THR15-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]] +; THR15-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> +; THR15-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4) +; THR15-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] +; THR15-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] +; THR15-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], 
<4 x i32> poison, <8 x i32> +; THR15-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4) +; THR15-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]] +; THR15-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]] +; THR15-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> +; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> +; THR15-NEXT: [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP98]], <16 x i32> [[TMP99]], <16 x i32> +; THR15-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> +; THR15-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> +; THR15-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> +; THR15-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> +; THR15-NEXT: [[TMP109:%.*]] = lshr <16 x i32> [[TMP108]], splat (i32 15) +; THR15-NEXT: [[TMP110:%.*]] = and <16 x i32> [[TMP109]], splat (i32 65537) +; THR15-NEXT: [[TMP111:%.*]] = mul <16 x i32> [[TMP110]], splat (i32 65535) +; THR15-NEXT: [[TMP112:%.*]] = add <16 x i32> [[TMP111]], [[TMP97]] +; THR15-NEXT: [[TMP113:%.*]] = xor <16 x i32> [[TMP112]], [[TMP108]] +; THR15-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP113]]) +; THR15-NEXT: ret i32 [[TMP114]] ; entry: %0 = load i8, ptr %pix1, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 7723746dda301..5d9975b25c381 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1022,10 +1022,8 @@ define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll index e24c52ba81ddf..3e2c305dbed65 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -1,17 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { -; CHECK-LABEL: @sitofp_uitofp( -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE2-LABEL: @sitofp_uitofp( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @sitofp_uitofp( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @sitofp_uitofp( +; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @sitofp_uitofp( +; AVX2-NEXT: [[TMP1:%.*]] = 
sitofp <8 x i32> [[A:%.*]] to <8 x float> +; AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @sitofp_uitofp( +; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -41,11 +69,39 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { } define <8 x i32> @fptosi_fptoui(<8 x float> %a) { -; CHECK-LABEL: @fptosi_fptoui( -; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE2-LABEL: @fptosi_fptoui( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @fptosi_fptoui( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX-LABEL: @fptosi_fptoui( +; AVX-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @fptosi_fptoui( +; AVX2-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @fptosi_fptoui( +; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -75,11 +131,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { } define <8 x float> @fneg_fabs(<8 x float> %a) { -; CHECK-LABEL: @fneg_fabs( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]] +; SSE2-LABEL: @fneg_fabs( +; SSE2-NEXT: [[TMP1:%.*]] = 
shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; SLM-LABEL: @fneg_fabs( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX-LABEL: @fneg_fabs( +; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX2-LABEL: @fneg_fabs( +; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX512-LABEL: @fneg_fabs( +; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -125,11 +209,39 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { } define <8 x i32> @sext_zext(<8 x i16> %a) { -; CHECK-LABEL: @sext_zext( -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE2-LABEL: @sext_zext( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @sext_zext( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX-LABEL: @sext_zext( +; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> 
[[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @sext_zext( +; AVX2-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @sext_zext( +; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll index 0f8751a6da7f5..880523d6474ac 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -1,17 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { -; CHECK-LABEL: @sitofp_uitofp( -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE2-LABEL: @sitofp_uitofp( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @sitofp_uitofp( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: 
[[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @sitofp_uitofp( +; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @sitofp_uitofp( +; AVX2-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @sitofp_uitofp( +; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -41,11 +69,39 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { } define <8 x i32> @fptosi_fptoui(<8 x float> %a) { -; CHECK-LABEL: @fptosi_fptoui( -; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE2-LABEL: @fptosi_fptoui( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @fptosi_fptoui( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX-LABEL: @fptosi_fptoui( +; AVX-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @fptosi_fptoui( +; AVX2-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @fptosi_fptoui( +; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> +; AVX512-NEXT: 
[[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -75,11 +131,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { } define <8 x float> @fneg_fabs(<8 x float> %a) { -; CHECK-LABEL: @fneg_fabs( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]] +; SSE2-LABEL: @fneg_fabs( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; SLM-LABEL: @fneg_fabs( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX-LABEL: @fneg_fabs( +; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX2-LABEL: @fneg_fabs( +; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX512-LABEL: @fneg_fabs( +; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -125,11 +209,39 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { } define <8 x i32> @sext_zext(<8 x i16> %a) { -; CHECK-LABEL: @sext_zext( -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE2-LABEL: @sext_zext( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: 
ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @sext_zext( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX-LABEL: @sext_zext( +; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @sext_zext( +; AVX2-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @sext_zext( +; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll index 5a1de4f3e3d7f..5cee6984df04f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -1,17 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, 
<8 x float> %b) { -; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fadd_fsub_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fadd_fsub_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fadd_fsub_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fadd_fsub_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @fadd_fsub_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,11 +79,43 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> 
[[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fmul_fdiv_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -110,6 +172,10 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX-NEXT: ret <4 x float> [[TMP1]] ; +; AVX2-LABEL: @fmul_fdiv_v4f32_const( +; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: ret <4 x float> [[TMP1]] +; ; AVX512-LABEL: @fmul_fdiv_v4f32_const( ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX512-NEXT: ret <4 x float> [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll index 046ed781f4c8d..9a2f959ac63bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -1,17 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown 
-mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fadd_fsub_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fadd_fsub_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fadd_fsub_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fadd_fsub_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @fadd_fsub_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,11 +79,43 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> 
-; CHECK-NEXT: ret <8 x float> [[TMP3]]
+; SSE-LABEL: @fmul_fdiv_v8f32(
+; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32>
+; SSE-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @fmul_fdiv_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @fmul_fdiv_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32>
+; AVX-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX2-LABEL: @fmul_fdiv_v8f32(
+; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32>
+; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32>
+; AVX2-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX512-LABEL: @fmul_fdiv_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
 ;
 %a0 = extractelement <8 x float> %a, i32 0
 %a1 = extractelement <8 x float> %a, i32 1
@@ -110,6 +172,10 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]],
 ; AVX-NEXT: ret <4 x float> [[TMP1]]
 ;
+; AVX2-LABEL: @fmul_fdiv_v4f32_const(
+; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]],
+; AVX2-NEXT: ret <4 x float> [[TMP1]]
+;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
 ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]],
 ; AVX512-NEXT: ret <4 x float> [[TMP1]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index 8839fc2281788..f8c5df9944538 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -7,11 +7,39 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @add_sub_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @add_sub_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @add_sub_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX1-LABEL: @add_sub_v8i32(
+; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX1-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @add_sub_v8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @add_sub_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
 ;
 %a0 = extractelement <8 x i32> %a, i32 0
 %a1 = extractelement <8 x i32> %a, i32 1
@@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
-; SSE-NEXT: ret <8 x i32> [[R71]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
 ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
-; SLM-NEXT: ret <8 x i32> [[TMP3]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
 ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
-; SSE-NEXT: ret <8 x i32> [[R71]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32>
 ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
 ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
-; SLM-NEXT: ret <8 x i32> [[R71]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]],
@@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 }
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; CHECK-LABEL: @add_sub_v8i32_splat(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
-; CHECK-NEXT: ret <8 x i32> [[TMP5]]
+; SSE-LABEL: @add_sub_v8i32_splat(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @add_sub_v8i32_splat(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX1-LABEL: @add_sub_v8i32_splat(
+; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX1-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX2-LABEL: @add_sub_v8i32_splat(
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX512-LABEL: @add_sub_v8i32_splat(
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX512-NEXT: ret <8 x i32> [[TMP5]]
 ;
 %a0 = extractelement <8 x i32> %a, i32 0
 %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index dfa918a6ea453..b84ef027f67c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -7,11 +7,39 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @add_sub_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @add_sub_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @add_sub_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX1-LABEL: @add_sub_v8i32(
+; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX1-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @add_sub_v8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @add_sub_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
 ;
 %a0 = extractelement <8 x i32> %a, i32 0
 %a1 = extractelement <8 x i32> %a, i32 1
@@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
-; SSE-NEXT: ret <8 x i32> [[R71]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
 ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
-; SLM-NEXT: ret <8 x i32> [[TMP3]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
 ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
-; SSE-NEXT: ret <8 x i32> [[R71]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32>
 ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
 ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
-; SLM-NEXT: ret <8 x i32> [[R71]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]],
@@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 }
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; CHECK-LABEL: @add_sub_v8i32_splat(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
-; CHECK-NEXT: ret <8 x i32> [[TMP5]]
+; SSE-LABEL: @add_sub_v8i32_splat(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @add_sub_v8i32_splat(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32>
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX1-LABEL: @add_sub_v8i32_splat(
+; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX1-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX2-LABEL: @add_sub_v8i32_splat(
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX512-LABEL: @add_sub_v8i32_splat(
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX512-NEXT: ret <8 x i32> [[TMP5]]
 ;
 %a0 = extractelement <8 x i32> %a, i32 0
 %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
index b659c10bb2fbf..7ed5f33c9dc6c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -7,7 +7,7 @@ define void @test() {
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3
 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0
 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
 ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]]
@@ -16,6 +16,8 @@ define void @test() {
 ; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer)
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4)
 ; CHECK-NEXT: ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
index 48b04201d1acc..e42e6183b8cae 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
@@ -15,8 +15,8 @@ define ptr @test(ptr %0, ptr %args_gep) {
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[ARG26]], i64 17
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0]]
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0]]
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32>
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
index 9fc2b7d6e7865..70c67ff251d6d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) {
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP7]], <6 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP14]], <6 x i32>
 ; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index c3122d991da20..faaac0c7614f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -286,8 +286,8 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
+; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
index cfbfd0ebc37bc..ea497c95d4114 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -10,22 +10,24 @@ define i32 @bar() local_unnamed_addr {
 ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef
 ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[SUB102_1]], i32 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> , <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[SUB102_3]], i32 12
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32>
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[SUB102_1]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[ADD94_1]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[SUB86_1]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[ADD78_2]], i32 5
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[SUB102_3]], i32 6
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> , <8 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> , <8 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8)
 ; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15)
 ; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537)
 ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535)
-; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP10]], <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP20]]
 ; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]])
 ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
index 2f49a2e6a212e..e9a65bf6d6f0d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
@@ -6,11 +6,11 @@ define i1 @foo() {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0
 ; CHECK-NEXT: [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0
-; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_1_2_I_I:%.*]] = icmp ne i32 [[STOREMERGE_2333_I_I]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL_NOT_NOT509_I_1_2_I_I]], i32 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> [[TMP0]], i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 5
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP1]], <4 x i1> zeroinitializer, i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> [[TMP2]], <2 x i1> zeroinitializer, i64 6)
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[STOREMERGE_2333_I_I]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> , <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> [[TMP6]], i64 4)
 ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
 ; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index df85656800aac..17ae33652b6d8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -153,8 +153,8 @@ define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32>
@@ -283,7 +283,7 @@ define void @test(ptr %i1, ptr %i2, ptr %o, i1 %arg) {
 ; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16
 ; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1
 ; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16
-; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[THEN:%.*]], label [[END:%.*]]
 ; CHECK: then:
 ; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16
 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
index 787bd39759dc7..b4e66138578df 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
@@ -103,10 +103,10 @@ define void @test2(ptr %p1, ptr %p2) {
 ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> , [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]],
 ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]],
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32>
 ; CHECK-NEXT: br label [[BB2:%.*]]
 ; CHECK: bb2:
-; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x double> [ [[TMP11]], [[BB1]] ], [ [[TMP16:%.*]], [[BB6:%.*]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15:%.*]], [[BB6:%.*]] ]
 ; CHECK-NEXT: [[X0:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, ptr [[X0]], align 8
 ; CHECK-NEXT: br i1 poison, label [[BB3:%.*]], label [[BB6]]
@@ -117,8 +117,7 @@ define void @test2(ptr %p1, ptr %p2) {
 ; CHECK: bb5:
 ; CHECK-NEXT: br label [[BB6]]
 ; CHECK: bb6:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
-; CHECK-NEXT: [[TMP16]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32>
+; CHECK-NEXT: [[TMP15]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
 ; CHECK-NEXT: br label [[BB2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
index 9682567b173c3..cda88620ab88a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
@@ -11,9 +11,9 @@ define void @test() {
 ; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32>
 ; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]]
 ; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]]
 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
index 8a017a397cff9..3b9222b7d5ed1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-10 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-4 < %s | FileCheck %s
 define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
 ; CHECK-LABEL: define i32 @test(
@@ -13,7 +13,7 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
 ; CHECK-NEXT: br i1 false, label %[[D_EXIT_3]], label %[[D_EXIT_6:.*]]
 ; CHECK: [[D_EXIT_3]]:
 ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[ENTRY]] ], [ poison, %[[IF_END_I_1]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[RETVAL_0_I_219]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> , i32 [[RETVAL_0_I_219]], i32 0
 ; CHECK-NEXT: br i1 [[TOBOOL_I_4]], label %[[D_EXIT_4:.*]], label %[[D_EXIT_6]]
 ; CHECK: [[D_EXIT_4]]:
 ; CHECK-NEXT: br label %[[D_EXIT_6]]
@@ -21,25 +21,29 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
 ; CHECK-NEXT: br i1 false, label %[[D_EXIT_6]], label %[[D_EXIT_7:.*]]
 ; CHECK: [[D_EXIT_6]]:
 ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
 ; CHECK-NEXT: br label %[[D_EXIT_7]]
 ; CHECK: [[D_EXIT_7]]:
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> , <8 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP0]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[RETVAL_0_I_219]], i32 7
-; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i32> [[TMP11]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> , i32 [[RETVAL_0_I_219]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3
 ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP21]], <4 x i32> [[TMP10]], i64 4)
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> , <8 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]]
+; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP19]], i64 0)
+; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP20]], [[TMP16]]
+; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP19]], <4 x i32> [[RDX_OP]], i64 0)
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP12]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP16]])
-; CHECK-NEXT: [[OP_RDX4:%.*]] = or i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT: ret i32 [[OP_RDX4]]
+; CHECK-NEXT: ret i32 [[TMP17]]
 ;
 entry:
 %0 = load i32, ptr %f, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
index c01c44ff03c15..1294a87ff6967 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -7,20 +7,14 @@ define void @test(i1 %c, ptr %arg) {
 ; CHECK: if:
 ; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32>
 ; CHECK-NEXT: br label [[JOIN:%.*]]
 ; CHECK: else:
 ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0)
-; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2)
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32>
 ; CHECK-NEXT: br label [[JOIN]]
 ; CHECK: join:
 ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
index 33fa00c1881da..38e9ba7ce7028 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -6,23 +6,19 @@ define i32 @a() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: br label %[[BB1:.*]]
 ; CHECK: [[BB1]]:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32>
 ; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <4 x i32>
 ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4)
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP18]], <8 x i32>
 ; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
 ; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
-; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4
-; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32>
+; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr null, align 4
 ; CHECK-NEXT: br label %[[BB1]]
 ;
 br label %1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
index 0ed12760b563f..e3a6020a542fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
index f47373747e578..cea98bf55b6ff 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index d650a972ad8ca..7060288d739bd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -29,8 +29,8 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/addsub.ll b/llvm/test/Transforms/SLPVectorizer/addsub.ll
index 3961250d56451..6814bc0f566f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/addsub.ll
@@ -387,14 +387,10 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
 define void @vec_shuff_reorder() #0 {
 ; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2)
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32>
 ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32>