From 57a6b0526cbcd3630fdad3f15a78777ccc8ae6d7 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 26 Feb 2025 16:44:45 +0000 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../llvm/Analysis/TargetTransformInfo.h | 8 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 4 + .../Target/RISCV/RISCVTargetTransformInfo.h | 2 + llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 + .../Transforms/Vectorize/SLPVectorizer.cpp | 748 +++++++++++++-- .../SLPVectorizer/AArch64/tsc-s116.ll | 13 +- .../SLPVectorizer/RISCV/complex-loads.ll | 854 +++++------------- .../SLPVectorizer/RISCV/reductions.ll | 6 +- .../X86/alternate-cast-inseltpoison.ll | 74 +- .../SLPVectorizer/X86/alternate-cast.ll | 74 +- .../X86/alternate-fp-inseltpoison.ll | 98 +- .../SLPVectorizer/X86/alternate-fp.ll | 98 +- .../X86/alternate-int-inseltpoison.ll | 106 ++- .../SLPVectorizer/X86/alternate-int.ll | 106 ++- .../X86/buildvector-schedule-for-subvector.ll | 4 +- .../X86/gathered-shuffle-resized.ll | 4 +- .../SLPVectorizer/X86/long-full-reg-stores.ll | 8 +- .../Transforms/SLPVectorizer/X86/lookahead.ll | 2 +- .../X86/matched-shuffled-entries.ll | 28 +- .../X86/non-load-reduced-as-part-of-bv.ll | 10 +- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 4 +- .../SLPVectorizer/X86/reorder-phi-operand.ll | 7 +- .../X86/reorder_diamond_match.ll | 6 +- .../X86/same-values-sub-node-with-poisons.ll | 36 +- .../X86/scatter-vectorize-reused-pointer.ll | 10 +- .../X86/splat-score-adjustment.ll | 22 +- .../X86/vec_list_bias-inseltpoison.ll | 2 +- .../SLPVectorizer/X86/vec_list_bias.ll | 2 +- .../vec_list_bias_external_insert_shuffled.ll | 2 +- llvm/test/Transforms/SLPVectorizer/addsub.ll | 12 +- 31 files changed, 1446 insertions(+), 907 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h index e1bebb01372e0..74318beee3f06 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1778,6 +1778,10 @@ class TargetTransformInfo { /// scalable version of the vectorized loop. bool preferFixedOverScalableIfEqualCost() const; + /// \returns True if target prefers SLP vectorizer with alternate opcode + /// vectorization, false - otherwise. + bool preferAlternateOpcodeVectorization() const; + /// \returns True if the target prefers reductions in loop. bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; @@ -2331,6 +2335,7 @@ class TargetTransformInfo::Concept { unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; virtual bool preferFixedOverScalableIfEqualCost() const = 0; + virtual bool preferAlternateOpcodeVectorization() const = 0; virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, @@ -3142,6 +3147,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool preferFixedOverScalableIfEqualCost() const override { return Impl.preferFixedOverScalableIfEqualCost(); } + bool preferAlternateOpcodeVectorization() const override { + return Impl.preferAlternateOpcodeVectorization(); + } bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index a8d6dd18266bb..fc70a096db1cf 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1006,6 +1006,8 @@ class TargetTransformInfoImplBase { bool preferFixedOverScalableIfEqualCost() const { return false; } + bool preferAlternateOpcodeVectorization() const { return 
true; } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 7df7038f6dd47..a3c9ded3c47d4 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1380,6 +1380,10 @@ bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const { return TTIImpl->preferFixedOverScalableIfEqualCost(); } +bool TargetTransformInfo::preferAlternateOpcodeVectorization() const { + return TTIImpl->preferAlternateOpcodeVectorization(); +} + bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 134a7333b9b06..6204ff88814f0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -119,6 +119,8 @@ class RISCVTTIImpl : public BasicTTIImplBase { unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + bool preferAlternateOpcodeVectorization() const { return false; } + bool preferEpilogueVectorization() const { // Epilogue vectorization is usually unprofitable - tail folding or // a smaller VF would have been better. 
This a blunt hammer - we diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 7786616f89aa6..d344fdb149517 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -292,6 +292,7 @@ class X86TTIImpl : public BasicTTIImplBase { TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; + bool preferAlternateOpcodeVectorization() const { return false; } bool prefersVectorizedAddressing() const; bool supportsEfficientVectorElementLoadStore() const; bool enableInterleavedAccessVectorization(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5fc5fb10fad55..a1edde3f72ff3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -840,6 +840,35 @@ class InstructionsState { return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } + /// Checks if main/alt instructions are shift operations. + bool isShiftOp() const { + return getMainOp()->isShift() && getAltOp()->isShift(); + } + + /// Checks if main/alt instructions are bitwise logic operations. + bool isBitwiseLogicOp() const { + return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp(); + } + + /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations. + bool isMulDivLikeOp() const { + constexpr std::array MulDiv = { + Instruction::Mul, Instruction::FMul, Instruction::SDiv, + Instruction::UDiv, Instruction::FDiv, Instruction::SRem, + Instruction::URem, Instruction::FRem}; + return is_contained(MulDiv, getOpcode()) && + is_contained(MulDiv, getAltOpcode()); + } + + /// Checks if main/alt instructions are add/sub/fadd/fsub operations. 
+ bool isAddSubLikeOp() const { + constexpr std::array AddSub = { + Instruction::Add, Instruction::Sub, Instruction::FAdd, + Instruction::FSub}; + return is_contained(AddSub, getOpcode()) && + is_contained(AddSub, getAltOpcode()); + } + /// Checks if the current state is valid, i.e. has non-null MainOp bool valid() const { return MainOp && AltOp; } @@ -1471,6 +1500,7 @@ class BoUpSLP { void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntries.clear(); + ScalarsInSplitNodes.clear(); MustGather.clear(); NonScheduledFirst.clear(); EntryToLastInstruction.clear(); @@ -1506,7 +1536,7 @@ class BoUpSLP { /// should be represented as an empty order, so this is used to /// decide if we can canonicalize a computed order. Undef elements /// (represented as size) are ignored. - bool isIdentityOrder(ArrayRef Order) const { + static bool isIdentityOrder(ArrayRef Order) { assert(!Order.empty() && "expected non-empty order"); const unsigned Sz = Order.size(); return all_of(enumerate(Order), [&](const auto &P) { @@ -3221,12 +3251,35 @@ class BoUpSLP { /// \returns Common mask for reorder indices and reused scalars. SmallVector getCommonMask() const { + if (State == TreeEntry::SplitVectorize) + return {}; SmallVector Mask; inversePermutation(ReorderIndices, Mask); ::addMask(Mask, ReuseShuffleIndices); return Mask; } + /// \returns The mask for split nodes. + SmallVector getSplitMask() const { + assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() && + "Expected only split vectorize node."); + SmallVector Mask(getVectorFactor(), PoisonMaskElem); + unsigned CommonVF = std::max( + CombinedEntriesWithIndices.back().second, + Scalars.size() - CombinedEntriesWithIndices.back().second); + for (auto [Idx, I] : enumerate(ReorderIndices)) + Mask[I] = + Idx + (Idx >= CombinedEntriesWithIndices.back().second + ? 
CommonVF - CombinedEntriesWithIndices.back().second + : 0); + return Mask; + } + + /// Updates (reorders) SplitVectorize node according to the given mask \p + /// Mask and order \p MaskOrder. + void reorderSplitNode(unsigned Idx, ArrayRef Mask, + ArrayRef MaskOrder); + /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { auto &&IsSame = [VL](ArrayRef Scalars, ArrayRef Mask) { @@ -3314,6 +3367,8 @@ class BoUpSLP { ///< complex node like select/cmp to minmax, mul/add to ///< fma, etc. Must be used for the following nodes in ///< the pattern, not the very first one. + SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them + ///< independently and then combines back. }; EntryState State; @@ -3344,7 +3399,7 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. unsigned Idx = 0; - /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from + /// For gather/buildvector/alt opcode nodes, which are combined from /// other nodes as a series of insertvector instructions. SmallVector, 2> CombinedEntriesWithIndices; @@ -3539,6 +3594,9 @@ class BoUpSLP { case CombinedVectorize: dbgs() << "CombinedVectorize\n"; break; + case SplitVectorize: + dbgs() << "SplitVectorize\n"; + break; } if (S) { dbgs() << "MainOp: " << *S.getMainOp() << "\n"; @@ -3619,8 +3677,10 @@ class BoUpSLP { const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = {}, ArrayRef ReorderIndices = {}) { - assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || - (Bundle && EntryState != TreeEntry::NeedToGather)) && + assert(((!Bundle && (EntryState == TreeEntry::NeedToGather || + EntryState == TreeEntry::SplitVectorize)) || + (Bundle && EntryState != TreeEntry::NeedToGather && + EntryState != TreeEntry::SplitVectorize)) && "Need to vectorize gather entry?"); // Gathered loads still gathered? Do not create entry, use the original one. 
if (GatheredLoadsEntriesFirst.has_value() && @@ -3654,11 +3714,38 @@ class BoUpSLP { return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); - if (S) + if (S) { Last->setOperations(S); + } else if (EntryState == TreeEntry::SplitVectorize) { + auto *MainOp = + cast(*find_if(Last->Scalars, IsaPred)); + auto *AltOp = cast(*find_if(Last->Scalars, [=](Value *V) { + auto *I = dyn_cast(V); + return I && I->getOpcode() != MainOp->getOpcode(); + })); + Last->setOperations(InstructionsState(MainOp, AltOp)); + } + if (EntryState == TreeEntry::SplitVectorize) { + SmallPtrSet Processed; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + auto It = ScalarsInSplitNodes.find(V); + if (It == ScalarsInSplitNodes.end()) { + ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back( + Last); + (void)Processed.insert(V); + } else if (Processed.insert(V).second) { + assert(!is_contained(It->getSecond(), Last) && + "Value already associated with the node."); + It->getSecond().push_back(Last); + } + } + } Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } - if (!Last->isGather()) { + if (!Last->isGather() && Last->State != TreeEntry::SplitVectorize) { SmallPtrSet Processed; for (Value *V : VL) { if (isa(V)) @@ -3695,7 +3782,7 @@ class BoUpSLP { } } assert(!BundleMember && "Bundle and VL out of sync"); - } else { + } else if (Last->isGather()) { // Build a map for gathered scalars to the nodes where they are used. bool AllConstsOrCasts = true; for (Value *V : VL) @@ -3740,6 +3827,15 @@ class BoUpSLP { return It->getSecond(); } + /// Get list of split vector entries, associated with the value \p V. + ArrayRef getSplitTreeEntries(Value *V) const { + assert(V && "V cannot be nullptr."); + auto It = ScalarsInSplitNodes.find(V); + if (It == ScalarsInSplitNodes.end()) + return {}; + return It->getSecond(); + } + /// Returns first vector node for value \p V, matching values \p VL. 
TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef VL, bool SameVF = false) const { @@ -3770,6 +3866,9 @@ class BoUpSLP { /// Maps a specific scalar to its tree entry(ies). SmallDenseMap> ScalarToTreeEntries; + /// Scalars, used in split vectorize nodes. + SmallDenseMap> ScalarsInSplitNodes; + /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; @@ -5720,12 +5819,14 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) && (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) return std::nullopt; - if ((TE.State == TreeEntry::Vectorize || - TE.State == TreeEntry::StridedVectorize) && - (isa(TE.getMainOp()) || - (TopToBottom && isa(TE.getMainOp())))) { - assert(!TE.isAltShuffle() && "Alternate instructions are only supported by " - "BinaryOperator and CastInst."); + if (TE.State == TreeEntry::SplitVectorize || + ((TE.State == TreeEntry::Vectorize || + TE.State == TreeEntry::StridedVectorize) && + (isa(TE.getMainOp()) || + (TopToBottom && isa(TE.getMainOp()))))) { + assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) && + "Alternate instructions are only supported by " + "BinaryOperator and CastInst."); return TE.ReorderIndices; } if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { @@ -5836,7 +5937,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, return std::nullopt; // No need to reorder. return std::move(Phis); } - if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && + if (TE.isGather() && + (!TE.hasState() || !TE.isAltShuffle() || + ScalarsInSplitNodes.contains(TE.getMainOp())) && allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. 
@@ -6044,6 +6147,30 @@ bool BoUpSLP::isProfitableToReorder() const { return true; } +void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef Mask, + ArrayRef MaskOrder) { + assert(State == TreeEntry::SplitVectorize && "Expected split user node."); + SmallVector NewMask(getVectorFactor()); + SmallVector NewMaskOrder(getVectorFactor()); + std::iota(NewMask.begin(), NewMask.end(), 0); + std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0); + if (Idx == 0) { + copy(Mask, NewMask.begin()); + copy(MaskOrder, NewMaskOrder.begin()); + } else { + assert(Idx == 1 && "Expected either 0 or 1 index."); + unsigned Offset = CombinedEntriesWithIndices.back().second; + for (unsigned I : seq(Mask.size())) { + NewMask[I + Offset] = Mask[I] + Offset; + NewMaskOrder[I + Offset] = MaskOrder[I] + Offset; + } + } + reorderScalars(Scalars, NewMask); + reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true); + if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -6078,7 +6205,8 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. 
- if (TE->hasState() && TE->isAltShuffle()) { + if (TE->hasState() && TE->isAltShuffle() && + TE->State != TreeEntry::SplitVectorize) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -6119,7 +6247,8 @@ void BoUpSLP::reorderTopToBottom() { } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) || + TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && @@ -6150,7 +6279,8 @@ void BoUpSLP::reorderTopToBottom() { for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, // just need to merge reordering shuffle and the reuse shuffle. - if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) && + OpTE->State != TreeEntry::SplitVectorize) continue; // Count number of orders uses. const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, @@ -6257,14 +6387,17 @@ void BoUpSLP::reorderTopToBottom() { // Just do the reordering for the nodes with the given VF. if (TE->Scalars.size() != VF) { if (TE->ReuseShuffleIndices.size() == VF) { + assert(TE->State != TreeEntry::SplitVectorize && + "Split vectorized not expected."); // Need to reorder the reuses masks of the operands with smaller VF to // be able to find the match between the graph nodes and scalar // operands of the given node during vectorization/cost estimation. 
- assert((!TE->UserTreeIndex || - TE->UserTreeIndex.UserTE->Scalars.size() == VF || - TE->UserTreeIndex.UserTE->Scalars.size() == - TE->Scalars.size()) && - "All users must be of VF size."); + assert( + (!TE->UserTreeIndex || + TE->UserTreeIndex.UserTE->Scalars.size() == VF || + TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() || + TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) && + "All users must be of VF size."); if (SLPReVec) { assert(SLPReVec && "Only supported by REVEC."); // ShuffleVectorInst does not do reorderOperands (and it should not @@ -6281,19 +6414,28 @@ void BoUpSLP::reorderTopToBottom() { // Update ordering of the operands with the smaller VF than the given // one. reorderNodeWithReuses(*TE, Mask); + // Update orders in user split vectorize nodes. + if (TE->UserTreeIndex && + TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) + TE->UserTreeIndex.UserTE->reorderSplitNode( + TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder); } continue; } - if ((TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) && - (isa(TE->getMainOp()) || - (SLPReVec && isa(TE->getMainOp())))) { - assert(!TE->isAltShuffle() && - "Alternate instructions are only supported by BinaryOperator " - "and CastInst."); - // Build correct orders for extract{element,value}, loads and - // stores. + if ((TE->State == TreeEntry::SplitVectorize && + TE->ReuseShuffleIndices.empty()) || + ((TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::StridedVectorize) && + (isa(TE->getMainOp()) || + (SLPReVec && isa(TE->getMainOp()))))) { + assert( + (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize && + TE->ReuseShuffleIndices.empty())) && + "Alternate instructions are only supported by BinaryOperator " + "and CastInst."); + // Build correct orders for extract{element,value}, loads, + // stores and alternate (split) nodes. 
reorderOrder(TE->ReorderIndices, Mask); if (isa(TE->getMainOp())) TE->reorderOperands(Mask); @@ -6314,6 +6456,11 @@ void BoUpSLP::reorderTopToBottom() { addMask(NewReuses, TE->ReuseShuffleIndices); TE->ReuseShuffleIndices.swap(NewReuses); } + // Update orders in user split vectorize nodes. + if (TE->UserTreeIndex && + TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) + TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx, + Mask, MaskOrder); } } } @@ -6326,7 +6473,8 @@ bool BoUpSLP::canReorderOperands( if (any_of(Edges, [I](const std::pair &OpData) { return OpData.first == I && (OpData.second->State == TreeEntry::Vectorize || - OpData.second->State == TreeEntry::StridedVectorize); + OpData.second->State == TreeEntry::StridedVectorize || + OpData.second->State == TreeEntry::SplitVectorize); })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { @@ -6340,6 +6488,7 @@ bool BoUpSLP::canReorderOperands( // node, just reorder reuses mask. if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); continue; @@ -6349,6 +6498,7 @@ bool BoUpSLP::canReorderOperands( [&Gather, UserTE, I](TreeEntry *TE) { assert(TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && "Only non-vectorized nodes are expected."); if (TE->UserTreeIndex.UserTE == UserTE && TE->UserTreeIndex.EdgeIdx == I) { @@ -6368,7 +6518,14 @@ bool BoUpSLP::canReorderOperands( } void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { - SetVector OrderedEntries; + struct TreeEntryCompare { + bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const { + if (LHS->UserTreeIndex && RHS->UserTreeIndex) + return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx; + return LHS->Idx < RHS->Idx; + } + }; + PriorityQueue, TreeEntryCompare> 
Queue; DenseSet GathersToOrders; // Find all reorderable leaf nodes with the given VF. // Currently the are vectorized loads,extracts without alternate operands + @@ -6376,13 +6533,15 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector NonVectorized; for (const std::unique_ptr &TE : VectorizableTree) { if (TE->State != TreeEntry::Vectorize && - TE->State != TreeEntry::StridedVectorize) + TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) { - OrderedEntries.insert(TE.get()); + Queue.push(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) || + TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.insert(TE.get()); } @@ -6393,40 +6552,88 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // one operand order in the natural order and reorder others + reorder the // user node itself. SmallPtrSet Visited, RevisitedOps; - while (!OrderedEntries.empty()) { + while (!Queue.empty()) { // 1. Filter out only reordered nodes. 
- DenseMap>> Users; - SmallVector Filtered; - for (TreeEntry *TE : OrderedEntries) { + std::pair>> Users; + TreeEntry *TE = Queue.top(); + const TreeEntry *UserTE = TE->UserTreeIndex.UserTE; + Queue.pop(); + SmallVector OrderedOps(1, TE); + while (!Queue.empty()) { + TE = Queue.top(); + if (!UserTE || UserTE != TE->UserTreeIndex.UserTE) + break; + Queue.pop(); + OrderedOps.push_back(TE); + } + for (TreeEntry *TE : OrderedOps) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize || (TE->isGather() && GathersToOrders.contains(TE))) || !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || - !Visited.insert(TE).second) { - Filtered.push_back(TE); + !Visited.insert(TE).second) continue; - } // Build a map between user nodes and their operands order to speedup // search. The graph currently does not provide this dependency directly. - Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx, - TE); - } - // Erase filtered entries. - for (TreeEntry *TE : Filtered) - OrderedEntries.remove(TE); - SmallVector< - std::pair>>> - UsersVec(Users.begin(), Users.end()); - sort(UsersVec, [](const auto &Data1, const auto &Data2) { - return Data1.first->Idx > Data2.first->Idx; - }); - for (auto &Data : UsersVec) { + Users.first = TE->UserTreeIndex.UserTE; + Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE); + } + if (Users.first) { + auto &Data = Users; + if (Data.first->State == TreeEntry::SplitVectorize) { + assert( + Data.second.size() <= 2 && + "Expected not greater than 2 operands for split vectorize node."); + if (any_of(Data.second, + [](const auto &Op) { return !Op.second->UserTreeIndex; })) + continue; + // Update orders in user split vectorize nodes. 
+ assert(Data.first->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 entries."); + for (const auto &P : Data.first->CombinedEntriesWithIndices) { + TreeEntry &OpTE = *VectorizableTree[P.first].get(); + OrdersType Order = OpTE.ReorderIndices; + if (Order.empty()) { + if (!OpTE.isGather()) + continue; + const auto BestOrder = + getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder); + if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder)) + continue; + Order = *BestOrder; + } + fixupOrderingIndices(Order); + SmallVector Mask; + inversePermutation(Order, Mask); + const unsigned E = Order.size(); + SmallVector MaskOrder(E, PoisonMaskElem); + transform(Order, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast(I) : PoisonMaskElem; + }); + Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder); + // Clear ordering of the operand. + if (!OpTE.ReorderIndices.empty()) { + OpTE.ReorderIndices.clear(); + } else { + assert(OpTE.isGather() && "Expected only gather/buildvector node."); + reorderScalars(OpTE.Scalars, Mask); + } + } + if (Data.first->ReuseShuffleIndices.empty() && + !Data.first->ReorderIndices.empty()) { + // Insert user node to the list to try to sink reordering deeper in + // the graph. + Queue.push(Data.first); + } + continue; + } // Check that operands are used only in the User node. 
SmallVector GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, GatherOps)) { for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); + Visited.insert(Op.second); continue; } // All operands are reordered and used only in this node - propagate the @@ -6519,6 +6726,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { UTE->UserTreeIndex.UserTE == Data.first) || (Data.first->UserTreeIndex && Data.first->UserTreeIndex.UserTE == UTE) || + (IgnoreReorder && UTE->UserTreeIndex && + UTE->UserTreeIndex.UserTE->Idx == 0) || NodeShouldBeReorderedWithOperands(UTE); })) continue; @@ -6532,7 +6741,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; const TreeEntry *Op = getOperandEntry(UTE, Idx); Visited.erase(Op); - OrderedEntries.insert(const_cast(Op)); + Queue.push(const_cast(Op)); } } } @@ -6589,7 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // the compile time. // Profitable to reorder if definitely more operands allow // reordering rather than those with natural order. - ArrayRef> Ops = Users[UserTE]; + ArrayRef> Ops = Users.second; if (static_cast(count_if( Ops, [UserTE, &AllowsReordering]( const std::pair &Op) { @@ -6601,7 +6810,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } if (OrdersUses.empty()) { for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); + Visited.insert(Op.second); continue; } // Choose the most used order. @@ -6631,7 +6840,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Set order of the user node. 
if (isIdentityOrder(BestOrder)) { for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); + Visited.insert(Op.second); continue; } fixupOrderingIndices(BestOrder); @@ -6646,7 +6855,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { }); for (const std::pair &Op : Data.second) { TreeEntry *TE = Op.second; - OrderedEntries.remove(TE); if (!VisitedOps.insert(TE).second) continue; if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { @@ -6656,6 +6864,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Gathers are processed separately. if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && (TE->State != TreeEntry::ScatterVectorize || TE->ReorderIndices.empty())) continue; @@ -6676,7 +6885,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; } reorderScalars(Gather->Scalars, Mask); - OrderedEntries.remove(Gather); + Visited.insert(Gather); } // Reorder operands of the user node and set the ordering for the user // node itself. @@ -6696,7 +6905,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { !Data.first->isAltShuffle()) { // Insert user node to the list to try to sink reordering deeper in // the graph. - OrderedEntries.insert(Data.first); + Queue.push(Data.first); } } else { reorderOrder(Data.first->ReorderIndices, Mask); @@ -6726,7 +6935,7 @@ void BoUpSLP::buildExternalUses( TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->isGather()) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) continue; // For each lane: @@ -8483,6 +8692,142 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // Tries to build split node. 
+ constexpr unsigned SmallNodeSize = 4; + auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize, + const InstructionsState &LocalState) { + if (VL.size() <= SmallNodeSize || TTI.preferAlternateOpcodeVectorization()) + return false; + + // Any value is used in split node already - just gather. + if (any_of(VL, [&](Value *V) { + return ScalarsInSplitNodes.contains(V) || isVectorized(V); + })) { + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return true; + } + SmallVector Op1, Op2; + OrdersType ReorderIndices(VL.size(), VL.size()); + SmallBitVector Op1Indices(VL.size()); + for (auto [Idx, V] : enumerate(VL)) { + auto *I = dyn_cast(V); + if (!I) { + Op1.push_back(V); + Op1Indices.set(Idx); + continue; + } + InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI); + if (NewS && !NewS.isAltShuffle()) { + Op1.push_back(V); + Op1Indices.set(Idx); + continue; + } + Op2.push_back(V); + } + Type *ScalarTy = getValueType(VL.front()); + VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); + unsigned Opcode0 = LocalState.getOpcode(); + unsigned Opcode1 = LocalState.getAltOpcode(); + SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1)); + // Enable split node, only if all nodes do not form legal alternate + // instruction (like X86 addsub). + SmallPtrSet UOp1(Op1.begin(), Op1.end()); + SmallPtrSet UOp2(Op2.begin(), Op2.end()); + if (UOp1.size() <= 1 || UOp2.size() <= 1 || + TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) || + !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) || + !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size())) + return false; + // Enable split node, only if all nodes are power-of-2/full registers. 
+ unsigned Op1Cnt = 0, Op2Cnt = Op1.size(); + for (unsigned Idx : seq(VL.size())) { + if (Op1Indices.test(Idx)) { + ReorderIndices[Op1Cnt] = Idx; + ++Op1Cnt; + } else { + ReorderIndices[Op2Cnt] = Idx; + ++Op2Cnt; + } + } + if (isIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); + SmallVector Mask; + if (!ReorderIndices.empty()) + inversePermutation(ReorderIndices, Mask); + unsigned NumParts = TTI.getNumberOfParts(VecTy); + VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size()); + VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size()); + // Check non-profitable single register ops, which better to be represented + // as alternate ops. + if (NumParts >= VL.size()) + return false; + if (LocalState.getMainOp()->isBinaryOp() && + LocalState.getAltOp()->isBinaryOp() && + (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() || + LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) { + constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; + InstructionCost InsertCost = ::getShuffleCost( + TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy); + FixedVectorType *SubVecTy = + getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size())); + InstructionCost NewShuffleCost = + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind); + if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost)) + return false; + InstructionCost OriginalVecOpsCost = + TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) + + TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind); + SmallVector OriginalMask(VL.size(), PoisonMaskElem); + for (unsigned Idx : seq(VL.size())) { + if (isa(VL[Idx])) + continue; + OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 
0 : VL.size()); + } + InstructionCost OriginalCost = + OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, + VecTy, OriginalMask, Kind); + InstructionCost NewVecOpsCost = + TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) + + TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind); + InstructionCost NewCost = + NewVecOpsCost + InsertCost + + (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::Store + ? NewShuffleCost + : 0); + // If not profitable to split - exit. + if (NewCost >= OriginalCost) + return false; + } + + SmallVector NewVL(VL.size()); + copy(Op1, NewVL.begin()); + copy(Op2, std::next(NewVL.begin(), Op1.size())); + auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt, + LocalState, UserTreeIdx, {}, ReorderIndices); + LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump()); + auto AddNode = [&](ArrayRef Op, unsigned Idx) { + InstructionsState S = getSameOpcode(Op, *TLI); + if (S && (isa(S.getMainOp()) || + getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) { + // Build gather node for loads, they will be gathered later. + TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(), + Idx == 0 ? 0 : Op1.size()); + (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt, + S, {TE, Idx}); + } else { + TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(), + Idx == 0 ? 0 : Op1.size()); + buildTree_rec(Op, Depth, {TE, Idx}); + } + }; + AddNode(Op1, 0); + AddNode(Op2, 1); + return true; + }; + // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. 
@@ -8568,6 +8913,47 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { + if (!S) { + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + for (Value *V : VL) { + if (isa(V)) + continue; + auto *I = dyn_cast(V); + if (!I) { + MainOp = AltOp = nullptr; + break; + } + if (!MainOp) { + MainOp = I; + continue; + } + if (MainOp->getOpcode() == I->getOpcode()) { + if (I->getParent() != MainOp->getParent()) { + MainOp = AltOp = nullptr; + break; + } + continue; + } + if (!AltOp) { + AltOp = I; + continue; + } + if (AltOp->getOpcode() == I->getOpcode()) { + if (I->getParent() != AltOp->getParent()) { + MainOp = AltOp = nullptr; + break; + } + continue; + } + MainOp = AltOp = nullptr; + break; + } + // Last chance to try to vectorize alternate node. + if (MainOp && AltOp && + TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp))) + return; + } LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, @@ -8647,6 +9033,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // FIXME: investigate if there are profitable cases for VL.size() <= 4. + if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S)) + return; + // Check that every instruction appears once in this bundle. if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; @@ -8679,6 +9069,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); + // Last chance to try to vectorize alternate node. 
+ if (S.isAltShuffle() && ReuseShuffleIndices.empty() && + TrySplitNode(SmallNodeSize, S)) + return; newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); NonScheduledFirst.insert(VL.front()); @@ -8823,6 +9217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); break; case TreeEntry::CombinedVectorize: + case TreeEntry::SplitVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected loads state."); } @@ -10010,6 +10405,16 @@ void BoUpSLP::transformNodes() { reorderGatherNode(E); } + // Better to use full gathered loads analysis, if there are only 2 loads + // gathered nodes each having less than 16 elements. + constexpr unsigned VFLimit = 16; + bool ForceLoadGather = + count_if(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->isGather() && TE->hasState() && + TE->getOpcode() == Instruction::Load && + TE->getVectorFactor() < VFLimit; + }) == 2; + // The tree may grow here, so iterate over nodes, built before. 
for (unsigned Idx : seq(BaseGraphSize)) { TreeEntry &E = *VectorizableTree[Idx]; @@ -10024,6 +10429,57 @@ void BoUpSLP::transformNodes() { E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; + if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load) + continue; + auto AreReusedScalars = [&](const TreeEntry *TE, + function_ref CheckContainer) { + return TE->isSame(VL) || all_of(VL, [&](Value *V) { + if (isa(V)) + return true; + auto *I = dyn_cast(V); + if (!I) + return false; + return is_contained(TE->Scalars, I) || CheckContainer(I); + }); + }; + if (E.hasState()) { + if (ArrayRef TEs = getTreeEntries(E.getMainOp()); + !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) { + return AreReusedScalars(TE, [&](Value *V) { + ArrayRef VTEs = getTreeEntries(V); + return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) { + return is_contained(TEs, TE); + }); + }); + })) + continue; + if (ArrayRef TEs = getSplitTreeEntries(E.getMainOp()); + !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) { + return AreReusedScalars(TE, [&](Value *V) { + ArrayRef VTEs = getSplitTreeEntries(V); + return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) { + return is_contained(TEs, TE); + }); + }); + })) + continue; + } else { + // Check if the gather node full copy of split node. + auto *It = find_if(VL, IsaPred); + if (It != VL.end()) { + if (ArrayRef TEs = getSplitTreeEntries(*It); + !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) { + return AreReusedScalars(TE, [&](Value *V) { + ArrayRef VTEs = getSplitTreeEntries(V); + return !VTEs.empty() && + any_of(VTEs, [&](const TreeEntry *TE) { + return is_contained(TEs, TE); + }); + }); + })) + continue; + } + } // Try to find vectorizable sequences and transform them into a series of // insertvector instructions. 
unsigned StartIdx = 0; @@ -11257,7 +11713,8 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, } const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), [&](const std::unique_ptr &TE) { - return TE->isGather() && + return (TE->isGather() || + TE->State == TreeEntry::SplitVectorize) && TE->UserTreeIndex.EdgeIdx == Idx && TE->UserTreeIndex.UserTE == E; }); @@ -11315,6 +11772,32 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return processBuildVector( E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); } + if (E->State == TreeEntry::SplitVectorize) { + assert(E->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 combined entries."); + assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask."); + InstructionCost VectorCost = 0; + if (E->ReorderIndices.empty()) { + VectorCost = ::getShuffleCost( + *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind, + E->CombinedEntriesWithIndices.back().second, + getWidenedType( + ScalarTy, + VectorizableTree[E->CombinedEntriesWithIndices.back().first] + ->getVectorFactor())); + } else { + unsigned CommonVF = + std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first] + ->getVectorFactor(), + VectorizableTree[E->CombinedEntriesWithIndices.back().first] + ->getVectorFactor()); + VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + getWidenedType(ScalarTy, CommonVF), + E->getSplitMask(), CostKind); + } + LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree")); + return VectorCost; + } InstructionCost CommonCost = 0; SmallVector Mask; if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize || @@ -11395,7 +11878,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, EI.EdgeIdx != 0) { auto UserBWIt = MinBWs.find(EI.UserTE); Type *UserScalarTy = - EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); + EI.UserTE->State == TreeEntry::SplitVectorize + ? 
EI.UserTE->Scalars.front()->getType() + : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); if (UserBWIt != MinBWs.end()) UserScalarTy = IntegerType::get(ScalarTy->getContext(), UserBWIt->second.first); @@ -11896,6 +12381,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, break; } case TreeEntry::CombinedVectorize: + case TreeEntry::SplitVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected vectorization state."); } @@ -12392,6 +12878,8 @@ bool BoUpSLP::isTreeNotExtendable() const { bool Res = false; for (unsigned Idx : seq(getTreeSize())) { TreeEntry &E = *VectorizableTree[Idx]; + if (E.State == TreeEntry::SplitVectorize) + return false; if (!E.isGather()) continue; if ((E.hasState() && E.getOpcode() != Instruction::Load) || @@ -12704,7 +13192,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather() && TE.hasState()) { + if (TE.hasState() && + (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) { if (const TreeEntry *E = getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars); E && E->getVectorFactor() == TE.getVectorFactor()) { @@ -13442,6 +13931,19 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( break; VToTEs.insert(TEPtr); } + if (ArrayRef VTEs = getSplitTreeEntries(V); !VTEs.empty()) { + const TreeEntry *VTE = VTEs.front(); + if (none_of(TE->CombinedEntriesWithIndices, + [&](const auto &P) { return P.first == VTE->Idx; })) { + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + } + // The node is reused - exit. 
+ if (CheckAndUseSameNode(VTE)) + break; + VToTEs.insert(VTE); + } if (ArrayRef VTEs = getTreeEntries(V); !VTEs.empty()) { const TreeEntry *VTE = VTEs.front(); if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) && @@ -14009,6 +14511,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { assert(((GatheredLoadsEntriesFirst.has_value() && E->getOpcode() == Instruction::Load && E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) || + E->State == TreeEntry::SplitVectorize || all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && @@ -14034,6 +14537,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { } assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || + E->State == TreeEntry::SplitVectorize || (isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I)) || (GatheredLoadsEntriesFirst.has_value() && @@ -14095,8 +14599,14 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return FirstInst; }; + if (E->State == TreeEntry::SplitVectorize) { + Res = FindLastInst(); + return *Res; + } + // Set insertpoint for gathered loads to the very first load. 
- if (GatheredLoadsEntriesFirst.has_value() && + if (E->State != TreeEntry::SplitVectorize && + GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load) { Res = FindFirstInst(); @@ -14175,7 +14685,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { bool IsPHI = isa(LastInst); if (IsPHI) LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); - if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) { + if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars)) || + (GatheredLoadsEntriesFirst.has_value() && + E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && + E->getOpcode() == Instruction::Load)) { Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); } else { // Set the insertion point after the last instruction in the bundle. Set the @@ -14981,7 +15494,9 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { // correctness of the transformations in many cases. 
auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), [E, NodeIdx](const std::unique_ptr &TE) { - return TE->isOperandGatherNode({E, NodeIdx}); + return TE->isOperandGatherNode({E, NodeIdx}) || + (TE->State == TreeEntry::SplitVectorize && + TE->UserTreeIndex == EdgeInfo(E, NodeIdx)); }); assert(I != VectorizableTree.end() && "Gather node is not in the graph."); assert(I->get()->UserTreeIndex && @@ -15519,6 +16034,83 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = Vec; return Vec; } + if (E->State == TreeEntry::SplitVectorize) { + assert(E->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 combined entries."); + setInsertPointAfterBundle(E); + TreeEntry &OpTE1 = + *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get(); + assert(OpTE1.isSame( + ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) && + "Expected same first part of scalars."); + Value *Op1 = vectorizeTree(&OpTE1); + TreeEntry &OpTE2 = + *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get(); + assert( + OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) && + "Expected same second part of scalars."); + Value *Op2 = vectorizeTree(&OpTE2); + auto GetOperandSignedness = [&](const TreeEntry *OpE) { + bool IsSigned = false; + auto It = MinBWs.find(OpE); + if (It != MinBWs.end()) + IsSigned = It->second.second; + else + IsSigned = any_of(OpE->Scalars, [&](Value *R) { + if (isa(V)) + return false; + return !isKnownNonNegative(R, SimplifyQuery(*DL)); + }); + return IsSigned; + }; + if (cast(Op1->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + Op1 = Builder.CreateIntCast( + Op1, + getWidenedType( + ScalarTy, + cast(Op1->getType())->getNumElements()), + GetOperandSignedness(&OpTE1)); + } + if (cast(Op2->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + Op2 = Builder.CreateIntCast( + Op2, + 
getWidenedType( + ScalarTy, + cast(Op2->getType())->getNumElements()), + GetOperandSignedness(&OpTE2)); + } + if (E->ReorderIndices.empty()) { + SmallVector Mask(E->getVectorFactor(), PoisonMaskElem); + std::iota( + Mask.begin(), + std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second), + 0); + Value *Vec = Builder.CreateShuffleVector(Op1, Mask); + Vec = createInsertVector(Builder, Vec, Op2, + E->CombinedEntriesWithIndices.back().second); + E->VectorizedValue = Vec; + return Vec; + } + unsigned CommonVF = + std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor()); + if (getNumElements(Op1->getType()) != CommonVF) { + SmallVector Mask(CommonVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()), + 0); + Op1 = Builder.CreateShuffleVector(Op1, Mask); + } + if (getNumElements(Op2->getType()) != CommonVF) { + SmallVector Mask(CommonVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()), + 0); + Op2 = Builder.CreateShuffleVector(Op2, Mask); + } + Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask()); + E->VectorizedValue = Vec; + return Vec; + } bool IsReverseOrder = !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); @@ -16973,7 +17565,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. 
- if (Entry->isGather()) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -17026,6 +17618,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, VectorizableTree.front().get()) || (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() && IE->UserTreeIndex.EdgeIdx == UINT_MAX))) && + !(VectorizableTree.front()->State == TreeEntry::SplitVectorize && + IE->UserTreeIndex && + is_contained(VectorizableTree.front()->Scalars, I)) && !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && @@ -18057,6 +18652,13 @@ bool BoUpSLP::collectValuesToDemote( ToDemote.push_back(E.Idx); return IsProfitableToDemote; }; + + if (E.State == TreeEntry::SplitVectorize) + return TryProcessInstruction( + BitWidth, + {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), + VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + switch (E.getOpcode()) { // We can always demote truncations and extensions. 
Since truncations can diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index c431b058f0d2d..92027d0043f76 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -17,15 +17,12 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] ; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 11fa3337544a1..749e50982cd3e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -1,668 +1,268 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s +; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v | FileCheck %s ; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15 define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[CONV1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; 
CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]] -; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], splat (i32 16) -; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> -; CHECK-NEXT: 
[[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]] -; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16) -; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]] -; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]] -; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0 -; CHECK-NEXT: [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]] ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]] -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = load <2 x i8>, ptr 
[[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32> -; CHECK-NEXT: [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]] -; CHECK-NEXT: [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16) -; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]] -; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; CHECK-NEXT: [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 -; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]] -; CHECK-NEXT: [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; CHECK-NEXT: [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32> -; CHECK-NEXT: [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; CHECK-NEXT: [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]] -; CHECK-NEXT: [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16) -; CHECK-NEXT: [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]] -; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 -; CHECK-NEXT: [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 -; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]] -; CHECK-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]] -; CHECK-NEXT: [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0 -; CHECK-NEXT: [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1 -; CHECK-NEXT: [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]] -; CHECK-NEXT: [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], 
[[XOR_I63_2]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0 -; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0 -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] -; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]] -; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]] -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15 -; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15 -; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0 -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1 -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]] -; CHECK-NEXT: [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]] -; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]] -; CHECK-NEXT: 
[[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]] -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15 -; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32> -; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32> -; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32> -; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]] -; CHECK-NEXT: [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16) -; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32> -; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> -; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]] -; CHECK-NEXT: [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16) -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]] -; 
CHECK-NEXT: [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]] -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]] -; CHECK-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]] -; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> -; CHECK-NEXT: [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]] -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]] -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0 -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1 -; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]] -; CHECK-NEXT: [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0 -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]] -; CHECK-NEXT: [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]] -; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15 -; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15 -; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; CHECK-NEXT: [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> -; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> 
-; CHECK-NEXT: [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32> -; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32> -; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]] -; CHECK-NEXT: [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16) -; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32> -; CHECK-NEXT: [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32> -; CHECK-NEXT: [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32> -; CHECK-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]] -; CHECK-NEXT: [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16) -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]] -; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]] -; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]] -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]] -; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]] -; CHECK-NEXT: [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]] -; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0 -; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]] -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]] -; CHECK-NEXT: [[TMP161:%.*]] = 
extractelement <2 x i32> [[TMP158]], i32 0 -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1 -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]] -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15 -; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 -; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15 -; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 -; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; CHECK-NEXT: [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15) -; CHECK-NEXT: [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537) -; CHECK-NEXT: [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535) -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] -; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]] -; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]] -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]] -; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]] -; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] -; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]] -; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]] -; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] -; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; CHECK-NEXT: [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 -; 
CHECK-NEXT: [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32> -; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0 -; CHECK-NEXT: [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0 -; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]] -; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]] -; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32> -; CHECK-NEXT: [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15) -; CHECK-NEXT: [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537) -; CHECK-NEXT: [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535) -; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0 -; CHECK-NEXT: [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0 -; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]] -; CHECK-NEXT: [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]] -; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32> -; CHECK-NEXT: [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1 -; CHECK-NEXT: [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1 -; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]] -; CHECK-NEXT: [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0 -; CHECK-NEXT: [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], 
i32 0 -; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32> -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]] -; CHECK-NEXT: [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]] -; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0 -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1 -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]] -; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]] -; CHECK-NEXT: [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]] -; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 -; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 -; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]] -; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]] -; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]] -; 
CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]] -; CHECK-NEXT: [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15 -; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]] -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]] -; CHECK-NEXT: [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]] -; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1 -; CHECK-NEXT: [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]] -; CHECK-NEXT: [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]] -; CHECK-NEXT: [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]] -; CHECK-NEXT: [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]] -; CHECK-NEXT: [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0 -; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 -; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]] -; CHECK-NEXT: [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]] -; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> -; CHECK-NEXT: [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]] -; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]] -; CHECK-NEXT: [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]] -; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]] -; 
CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15) -; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537) -; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535) -; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]] -; CHECK-NEXT: [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]] -; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 -; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 -; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 -; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] -; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]] -; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0 -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]] -; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1 -; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP190]] -; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP116:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +; CHECK-NEXT: [[TMP13:%.*]] = add 
<4 x i32> [[TMP12]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16) +; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]] +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]] +; 
CHECK-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16) +; CHECK-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]] +; CHECK-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]] +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 +; 
CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP116]], i32 0 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2) +; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) +; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]] +; CHECK-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x 
i32> [[TMP87]], i64 4) +; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] +; CHECK-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4) +; CHECK-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]] +; CHECK-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]] +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> +; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <16 x i32> [[TMP108]], <16 x i32> [[TMP109]], <16 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = lshr <16 x i32> [[TMP110]], splat (i32 15) +; CHECK-NEXT: [[TMP112:%.*]] = and <16 x i32> [[TMP111]], splat (i32 65537) +; CHECK-NEXT: [[TMP113:%.*]] = mul <16 x i32> [[TMP112]], splat (i32 65535) +; CHECK-NEXT: [[TMP114:%.*]] = add <16 x i32> [[TMP113]], [[TMP99]] +; 
CHECK-NEXT: [[TMP115:%.*]] = xor <16 x i32> [[TMP114]], [[TMP110]] +; CHECK-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]]) ; CHECK-NEXT: ret i32 [[ADD113_3]] ; ; THR15-LABEL: define i32 @test( ; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; THR15-NEXT: entry: -; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 -; THR15-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 ; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 -; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32 ; THR15-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; THR15-NEXT: [[TMP2:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; THR15-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP2]] to i32 ; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 -; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; THR15-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; THR15-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr 
[[ADD_PTR64_1]], i64 4 -; THR15-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 +; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 +; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 +; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; THR15-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; THR15-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; THR15-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; THR15-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; THR15-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; THR15-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; THR15-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]] +; THR15-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +; THR15-NEXT: [[TMP47:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] +; THR15-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP47]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP47]] +; THR15-NEXT: [[TMP49:%.*]] = sub <4 x i32> [[TMP14]], [[TMP47]] +; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP49]], <4 x i32> +; THR15-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]] +; THR15-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]] +; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; THR15-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 +; THR15-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; THR15-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP25:%.*]] = zext <4 x 
i8> [[TMP24]] to <4 x i32> +; THR15-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]] +; THR15-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; THR15-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]] +; THR15-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16) +; THR15-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]] +; THR15-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]] +; THR15-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]] +; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> +; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]] +; THR15-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]] +; THR15-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> ; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; THR15-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; THR15-NEXT: [[TMP87:%.*]] = zext i8 [[TMP6]] to i32 +; THR15-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> 
[[TMP20]], [[TMP22]] +; THR15-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; THR15-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]] ; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; THR15-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32> +; THR15-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32> ; THR15-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; THR15-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32> -; THR15-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]] -; THR15-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], splat (i32 16) -; THR15-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]] -; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32> -; THR15-NEXT: [[TMP86:%.*]] = zext i8 [[TMP7]] to i32 -; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32> -; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]] -; THR15-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32> -; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]] -; THR15-NEXT: [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], splat (i32 16) -; THR15-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]] -; THR15-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP76]], [[TMP59]] -; THR15-NEXT: [[TMP42:%.*]] = sub <2 x i32> [[TMP59]], [[TMP76]] -; 
THR15-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0 -; THR15-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1 -; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]] -; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0 -; THR15-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1 -; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]] -; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[TMP47:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1 -; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32> -; THR15-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 -; THR15-NEXT: [[TMP50:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32> -; THR15-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]] +; THR15-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP16]] to <4 x i32> +; THR15-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]] +; THR15-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16) +; THR15-NEXT: [[TMP62:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]] +; THR15-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP62]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP62]] +; THR15-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP62]] +; THR15-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> +; THR15-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] +; THR15-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] +; THR15-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> ; THR15-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; 
THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; THR15-NEXT: [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP54]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; THR15-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> -; THR15-NEXT: [[TMP57:%.*]] = sub <2 x i32> [[TMP77]], [[TMP56]] -; THR15-NEXT: [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], splat (i32 16) -; THR15-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]] -; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; THR15-NEXT: [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 -; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> -; THR15-NEXT: [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; THR15-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> -; THR15-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]] -; THR15-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; THR15-NEXT: [[TMP67:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; THR15-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]] -; THR15-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16) -; THR15-NEXT: [[TMP73:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]] -; THR15-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 -; THR15-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 -; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] -; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0 -; THR15-NEXT: 
[[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1 -; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] -; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP80]], [[TMP81]] -; THR15-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] -; THR15-NEXT: [[TMP78:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP78]], i32 [[ADD48_3]], i32 0 -; THR15-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[ADD55_3]], i32 0 -; THR15-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP71]], [[TMP83]] -; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; THR15-NEXT: [[TMP137:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP137]], i32 [[SUB45_3]], i32 0 -; THR15-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[SUB47_3]], i32 0 -; THR15-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] -; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD44_2]] -; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD44_2]], [[ADD48_4]] -; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15 -; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 -; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP44]], 15 -; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 -; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; THR15-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_4]], [[ADD46_2]] -; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD46_2]], [[ADD55_4]] -; THR15-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15 -; THR15-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; THR15-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; THR15-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15 -; THR15-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; THR15-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 -; THR15-NEXT: [[TMP88:%.*]] = 
extractelement <2 x i32> [[TMP79]], i32 0 -; THR15-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1 -; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP88]], [[TMP89]] -; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP89]], [[TMP88]] -; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; THR15-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP90]], [[TMP91]] -; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP91]], [[TMP90]] -; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15 -; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; THR15-NEXT: [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; THR15-NEXT: [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32> -; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; THR15-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> -; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; THR15-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> -; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]] -; THR15-NEXT: [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], splat (i32 16) -; THR15-NEXT: [[TMP102:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP103:%.*]] = zext <2 x 
i8> [[TMP102]] to <2 x i32> -; THR15-NEXT: [[TMP104:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; THR15-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; THR15-NEXT: [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]] -; THR15-NEXT: [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], splat (i32 16) -; THR15-NEXT: [[TMP110:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP111:%.*]] = sub <2 x i32> [[TMP110]], [[TMP103]] -; THR15-NEXT: [[TMP112:%.*]] = add <2 x i32> [[TMP109]], [[TMP111]] -; THR15-NEXT: [[TMP113:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP114:%.*]] = sub <2 x i32> [[TMP113]], [[TMP95]] -; THR15-NEXT: [[TMP115:%.*]] = add <2 x i32> [[TMP101]], [[TMP114]] -; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP115]], <2 x i32> -; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP112]], [[TMP115]] -; THR15-NEXT: [[TMP118:%.*]] = sub <2 x i32> [[TMP115]], [[TMP112]] -; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0 -; THR15-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1 -; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP120]], [[TMP119]] -; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP119]], [[TMP120]] -; THR15-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0 -; THR15-NEXT: [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1 -; THR15-NEXT: [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]] -; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP121]], [[TMP122]] -; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15 -; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP122]], 15 -; THR15-NEXT: 
[[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; THR15-NEXT: [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; THR15-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32> -; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; THR15-NEXT: [[TMP125:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32> -; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; THR15-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; THR15-NEXT: [[TMP129:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32> -; THR15-NEXT: [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]] -; THR15-NEXT: [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], splat (i32 16) -; THR15-NEXT: [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32> -; THR15-NEXT: [[TMP154:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32> -; THR15-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> -; THR15-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP155]], [[TMP134]] -; THR15-NEXT: [[TMP170:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16) -; THR15-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]] -; THR15-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP170]], [[TMP141]] -; THR15-NEXT: 
[[TMP186:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV_1]], i32 0 -; THR15-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP186]], [[TMP126]] -; THR15-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP132]], [[TMP187]] -; THR15-NEXT: [[TMP136:%.*]] = add <2 x i32> [[TMP171]], [[TMP142]] -; THR15-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP142]], [[TMP171]] -; THR15-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0 -; THR15-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1 -; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP145]], [[TMP144]] -; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]] -; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 -; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1 -; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]] -; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]] -; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP145]], 15 -; THR15-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 -; THR15-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP151]], 15 -; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 -; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; THR15-NEXT: [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], splat (i32 15) -; THR15-NEXT: [[TMP157:%.*]] = and <2 x i32> [[TMP156]], splat (i32 65537) -; THR15-NEXT: [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], splat (i32 65535) -; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_2]], [[ADD48]] -; THR15-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_2]] -; THR15-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] -; THR15-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; THR15-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] -; THR15-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] -; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] -; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]] -; 
THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP44]] -; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP145]] -; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]] -; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] -; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; THR15-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD48_1]], [[ADD55]] -; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] -; THR15-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] -; THR15-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] -; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; THR15-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] -; THR15-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP86]] -; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] -; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]] -; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP151]] -; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]] -; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] -; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] -; THR15-NEXT: [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 
[[ADD78_2]], i32 0 -; THR15-NEXT: [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 -; THR15-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP161]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP160]], [[TMP162]] -; THR15-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP160]], [[TMP162]] -; THR15-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> [[TMP164]], <2 x i32> -; THR15-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]] -; THR15-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]] -; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] -; THR15-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; THR15-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP158]], [[TMP165]] -; THR15-NEXT: [[TMP167:%.*]] = xor <2 x i32> [[TMP166]], [[TMP124]] -; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP119]], 15 -; THR15-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; THR15-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP119]] -; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; THR15-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP167]], i32 0 -; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP168]] -; THR15-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1 -; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]] -; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[SUB59]] -; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB51_1]] -; THR15-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 -; THR15-NEXT: [[TMP173:%.*]] = shufflevector <2 x 
i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 -; THR15-NEXT: [[TMP175:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP173]], [[TMP175]] -; THR15-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP173]], [[TMP175]] -; THR15-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP177]], <2 x i32> -; THR15-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]] -; THR15-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]] -; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]] -; THR15-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]] -; THR15-NEXT: [[TMP179:%.*]] = lshr <2 x i32> [[TMP93]], splat (i32 15) -; THR15-NEXT: [[TMP180:%.*]] = and <2 x i32> [[TMP179]], splat (i32 65537) -; THR15-NEXT: [[TMP181:%.*]] = mul <2 x i32> [[TMP180]], splat (i32 65535) -; THR15-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP181]], [[TMP178]] -; THR15-NEXT: [[TMP183:%.*]] = xor <2 x i32> [[TMP182]], [[TMP93]] -; THR15-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 -; THR15-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 -; THR15-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 -; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] -; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; THR15-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP183]], i32 0 -; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]] -; THR15-NEXT: [[TMP185:%.*]] = extractelement <2 x i32> [[TMP183]], i32 1 -; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP185]] -; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] +; THR15-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP64:%.*]] = zext <4 x i8> 
[[TMP63]] to <4 x i32> +; THR15-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] +; THR15-NEXT: [[TMP155:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 +; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; THR15-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP53]], i64 2) +; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; THR15-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; THR15-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> +; THR15-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] +; THR15-NEXT: [[TMP165:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) +; THR15-NEXT: [[TMP166:%.*]] = add <4 x i32> [[TMP165]], [[TMP155]] +; THR15-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP166]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP166]], [[TMP79]] +; THR15-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP166]], [[TMP79]] +; THR15-NEXT: [[TMP222:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> +; THR15-NEXT: [[TMP217:%.*]] = shufflevector <4 x i32> [[TMP222]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP223:%.*]] = add <4 x i32> [[TMP222]], [[TMP217]] +; THR15-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP222]], [[TMP217]] +; THR15-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP223]], <4 x i32> [[TMP85]], <4 x i32> +; THR15-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]] +; THR15-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]] +; THR15-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> 
+; THR15-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP91]], i64 4) +; THR15-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] +; THR15-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] +; THR15-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> +; THR15-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP87]], i64 4) +; THR15-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP90]], [[TMP94]] +; THR15-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP94]], [[TMP90]] +; THR15-NEXT: [[TMP102:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> +; THR15-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> +; THR15-NEXT: [[TMP104:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP105:%.*]] = shufflevector <16 x i32> [[TMP103]], <16 x i32> [[TMP104]], <16 x i32> +; THR15-NEXT: [[TMP106:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP107:%.*]] = shufflevector <16 x i32> [[TMP105]], <16 x i32> [[TMP106]], <16 x i32> +; THR15-NEXT: [[TMP108:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP109:%.*]] = shufflevector <16 x i32> [[TMP107]], <16 x i32> [[TMP108]], <16 x i32> +; THR15-NEXT: [[TMP110:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP111:%.*]] = shufflevector <16 x i32> [[TMP109]], <16 x i32> [[TMP110]], <16 x i32> +; THR15-NEXT: [[TMP112:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP113:%.*]] = shufflevector <16 x i32> [[TMP111]], <16 x i32> [[TMP112]], <16 x i32> +; THR15-NEXT: [[TMP114:%.*]] = lshr <16 x i32> [[TMP113]], splat (i32 15) +; THR15-NEXT: [[TMP115:%.*]] = and <16 x i32> [[TMP114]], splat (i32 65537) +; THR15-NEXT: [[TMP116:%.*]] = mul <16 x 
i32> [[TMP115]], splat (i32 65535) +; THR15-NEXT: [[TMP117:%.*]] = add <16 x i32> [[TMP116]], [[TMP102]] +; THR15-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[TMP117]], [[TMP113]] +; THR15-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]]) ; THR15-NEXT: ret i32 [[ADD113_3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 5b0f4a69de4c3..1ea41a18330dd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1024,10 +1024,8 @@ define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll index e24c52ba81ddf..b374e877bb38a 
100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -1,16 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; CHECK-LABEL: @sitofp_uitofp( -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; 
CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -42,9 +44,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; CHECK-LABEL: @fptosi_fptoui( -; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -75,11 +79,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { } define <8 x float> @fneg_fabs(<8 x float> %a) { -; CHECK-LABEL: @fneg_fabs( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]] +; SSE2-LABEL: @fneg_fabs( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector 
<8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; SLM-LABEL: @fneg_fabs( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX-LABEL: @fneg_fabs( +; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX2-LABEL: @fneg_fabs( +; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX512-LABEL: @fneg_fabs( +; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -126,9 +158,11 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { define <8 x i32> @sext_zext(<8 x i16> %a) { 
; CHECK-LABEL: @sext_zext( -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll index 0f8751a6da7f5..ddd3dffaafcc5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -1,16 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s 
--check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; CHECK-LABEL: @sitofp_uitofp( -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -42,9 +44,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; CHECK-LABEL: @fptosi_fptoui( -; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x 
float> [[A]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -75,11 +79,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { } define <8 x float> @fneg_fabs(<8 x float> %a) { -; CHECK-LABEL: @fneg_fabs( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]] +; SSE2-LABEL: @fneg_fabs( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; SLM-LABEL: @fneg_fabs( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX-LABEL: @fneg_fabs( +; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x 
i32> +; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX2-LABEL: @fneg_fabs( +; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX512-LABEL: @fneg_fabs( +; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -126,9 +158,11 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { define <8 x i32> @sext_zext(<8 x i16> %a) { ; CHECK-LABEL: @sext_zext( -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll index 5a1de4f3e3d7f..5cee6984df04f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -1,17 +1,47 @@ 
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x 
float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fadd_fsub_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fadd_fsub_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fadd_fsub_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fadd_fsub_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @fadd_fsub_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: 
[[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,11 +79,43 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: 
[[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fmul_fdiv_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -110,6 +172,10 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX-NEXT: ret <4 x float> [[TMP1]] ; +; AVX2-LABEL: @fmul_fdiv_v4f32_const( +; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: ret <4 x float> [[TMP1]] +; ; AVX512-LABEL: @fmul_fdiv_v4f32_const( ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX512-NEXT: ret <4 x float> [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll index 046ed781f4c8d..9a2f959ac63bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -1,17 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fadd_fsub_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> 
[[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fadd_fsub_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fadd_fsub_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fadd_fsub_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @fadd_fsub_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,11 +79,43 
@@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX2-LABEL: @fmul_fdiv_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; 
AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -110,6 +172,10 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX-NEXT: ret <4 x float> [[TMP1]] ; +; AVX2-LABEL: @fmul_fdiv_v4f32_const( +; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: ret <4 x float> [[TMP1]] +; ; AVX512-LABEL: @fmul_fdiv_v4f32_const( ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX512-NEXT: ret <4 x float> [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll index 8839fc2281788..f8c5df9944538 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -7,11 +7,39 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @add_sub_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> 
[[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @add_sub_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @add_sub_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX1-LABEL: @add_sub_v8i32( +; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @add_sub_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @add_sub_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x 
i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP3]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] @@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x 
i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[R71]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], @@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { } define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { -; CHECK-LABEL: @add_sub_v8i32_splat( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP5]] +; SSE-LABEL: @add_sub_v8i32_splat( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @add_sub_v8i32_splat( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, 
<4 x i32> zeroinitializer +; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX1-LABEL: @add_sub_v8i32_splat( +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX2-LABEL: @add_sub_v8i32_splat( +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX512-LABEL: @add_sub_v8i32_splat( +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP5]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll index dfa918a6ea453..b84ef027f67c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -7,11 +7,39 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @add_sub_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @add_sub_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @add_sub_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX1-LABEL: @add_sub_v8i32( +; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @add_sub_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> 
[[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @add_sub_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP3]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] @@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: 
[[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[R71]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], @@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { } define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { -; CHECK-LABEL: @add_sub_v8i32_splat( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP5]] +; SSE-LABEL: @add_sub_v8i32_splat( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 
x i32> zeroinitializer +; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @add_sub_v8i32_splat( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX1-LABEL: @add_sub_v8i32_splat( +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX2-LABEL: @add_sub_v8i32_splat( +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX512-LABEL: 
@add_sub_v8i32_splat( +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP5]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index b659c10bb2fbf..7ed5f33c9dc6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -7,7 +7,7 @@ define void @test() { ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]] @@ -16,6 +16,8 @@ define void @test() { ; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: 
ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll index 48b04201d1acc..e42e6183b8cae 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll @@ -15,8 +15,8 @@ define ptr @test(ptr %0, ptr %args_gep) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[ARG26]], i64 17 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0:![0-9]+]] -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0]] ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll index 9fc2b7d6e7865..70c67ff251d6d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll @@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x 
double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP7]], <6 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP14]], <6 x i32> ; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 ; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index c3122d991da20..faaac0c7614f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -286,8 +286,8 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 +; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll index cfbfd0ebc37bc..ea497c95d4114 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -10,22 +10,24 @@ define i32 @bar() local_unnamed_addr { ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[SUB102_1]], i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> , <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[SUB102_3]], i32 12 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[SUB102_1]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[ADD94_1]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[SUB86_1]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[ADD78_2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x 
i32> [[TMP4]], i32 [[SUB102_3]], i32 6 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8) ; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15) ; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537) ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535) -; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP20]] ; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]]) ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll index 2f49a2e6a212e..e9a65bf6d6f0d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll @@ -6,11 +6,11 @@ define i1 @foo() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0 ; CHECK-NEXT: [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0 -; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_1_2_I_I:%.*]] = icmp ne i32 
[[STOREMERGE_2333_I_I]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL_NOT_NOT509_I_1_2_I_I]], i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> [[TMP0]], i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP1]], <4 x i1> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> [[TMP2]], <2 x i1> zeroinitializer, i64 6) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[STOREMERGE_2333_I_I]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> , <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> [[TMP6]], i64 4) ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index df85656800aac..17ae33652b6d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -153,8 +153,8 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x 
float> [[TMP8]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> @@ -283,7 +283,7 @@ define void @test(ptr %i1, ptr %i2, ptr %o, i1 %arg) { ; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16 ; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1 ; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16 -; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll index 787bd39759dc7..b4e66138578df 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll @@ -103,10 +103,10 @@ define void @test2(ptr %p1, ptr %p2) { ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> , [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x double> [ [[TMP11]], [[BB1]] ], [ [[TMP16:%.*]], [[BB6:%.*]] ] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15:%.*]], [[BB6:%.*]] ] ; CHECK-NEXT: [[X0:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, ptr [[X0]], 
align 8 ; CHECK-NEXT: br i1 poison, label [[BB3:%.*]], label [[BB6]] @@ -117,8 +117,7 @@ define void @test2(ptr %p1, ptr %p2) { ; CHECK: bb5: ; CHECK-NEXT: br label [[BB6]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ] -; CHECK-NEXT: [[TMP16]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP15]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll index 9682567b173c3..cda88620ab88a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -11,9 +11,9 @@ define void @test() { ; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll index 8a017a397cff9..3b9222b7d5ed1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-10 < %s | FileCheck %s +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-4 < %s | FileCheck %s define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) { ; CHECK-LABEL: define i32 @test( @@ -13,7 +13,7 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) { ; CHECK-NEXT: br i1 false, label %[[D_EXIT_3]], label %[[D_EXIT_6:.*]] ; CHECK: [[D_EXIT_3]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[ENTRY]] ], [ poison, %[[IF_END_I_1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[RETVAL_0_I_219]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> , i32 [[RETVAL_0_I_219]], i32 0 ; CHECK-NEXT: br i1 [[TOBOOL_I_4]], label %[[D_EXIT_4:.*]], label %[[D_EXIT_6]] ; CHECK: [[D_EXIT_4]]: ; CHECK-NEXT: br label %[[D_EXIT_6]] @@ -21,25 +21,29 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) { ; CHECK-NEXT: br i1 false, label %[[D_EXIT_6]], label %[[D_EXIT_7:.*]] ; CHECK: [[D_EXIT_6]]: ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, 
%[[D_EXIT_4]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ] ; CHECK-NEXT: br label %[[D_EXIT_7]] ; CHECK: [[D_EXIT_7]]: -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP0]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[RETVAL_0_I_219]], i32 7 -; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i32> [[TMP11]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> , i32 [[RETVAL_0_I_219]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3 ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = 
shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP21]], <4 x i32> [[TMP10]], i64 4) +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]] +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP19]], i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP20]], [[TMP16]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP19]], <4 x i32> [[RDX_OP]], i64 0) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP12]]) -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP16]]) -; CHECK-NEXT: [[OP_RDX4:%.*]] = or i32 [[TMP18]], [[TMP17]] -; CHECK-NEXT: ret i32 [[OP_RDX4]] +; CHECK-NEXT: ret i32 [[TMP17]] ; entry: %0 = load i32, ptr %f, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index c01c44ff03c15..1294a87ff6967 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -7,20 +7,14 @@ define void @test(i1 %c, ptr %arg) { ; CHECK: if: ; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> 
@llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll index 33fa00c1881da..38e9ba7ce7028 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll @@ -6,23 +6,19 @@ define i32 @a() { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: 
[[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP18]], <8 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]] -; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4 -; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr null, align 4 ; CHECK-NEXT: br label %[[BB1]] ; br label %1 diff 
--git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index 0ed12760b563f..e3a6020a542fb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll index f47373747e578..cea98bf55b6ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll index 
d650a972ad8ca..7060288d739bd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll @@ -29,8 +29,8 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] diff --git a/llvm/test/Transforms/SLPVectorizer/addsub.ll b/llvm/test/Transforms/SLPVectorizer/addsub.ll index 3961250d56451..6814bc0f566f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/addsub.ll @@ -387,14 +387,10 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re define void @vec_shuff_reorder() #0 { ; CHECK-LABEL: @vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) -; CHECK-NEXT: 
[[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32>