From b0e4d402159778e38e08b7442607f1c0a9c7c66f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 17 Jan 2025 15:58:02 +0000 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 589 +++++++++++- .../PhaseOrdering/AArch64/slpordering.ll | 178 ++-- .../AArch64/gather-with-minbith-user.ll | 6 +- .../SLPVectorizer/AArch64/loadorder.ll | 160 ++-- .../SLPVectorizer/AArch64/tsc-s116.ll | 13 +- .../SLPVectorizer/RISCV/complex-loads.ll | 848 +++++------------- .../SLPVectorizer/RISCV/reductions.ll | 6 +- .../X86/alternate-cast-inseltpoison.ll | 74 +- .../SLPVectorizer/X86/alternate-cast.ll | 74 +- .../X86/alternate-fp-inseltpoison.ll | 80 +- .../SLPVectorizer/X86/alternate-fp.ll | 80 +- .../X86/alternate-int-inseltpoison.ll | 106 ++- .../SLPVectorizer/X86/alternate-int.ll | 106 ++- .../X86/buildvector-schedule-for-subvector.ll | 4 +- .../SLPVectorizer/X86/long-full-reg-stores.ll | 6 +- .../X86/matched-shuffled-entries.ll | 29 +- .../X86/non-load-reduced-as-part-of-bv.ll | 10 +- .../X86/scatter-vectorize-reused-pointer.ll | 10 +- .../X86/splat-score-adjustment.ll | 22 +- llvm/test/Transforms/SLPVectorizer/addsub.ll | 12 +- .../resized-alt-shuffle-after-minbw.ll | 35 +- 21 files changed, 1450 insertions(+), 998 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b0b8f8249d657..59063d6b4c9bc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1461,6 +1461,7 @@ class BoUpSLP { VectorizableTree.clear(); ScalarToTreeEntry.clear(); MultiNodeScalars.clear(); + ScalarsInSplitNodes.clear(); MustGather.clear(); NonScheduledFirst.clear(); EntryToLastInstruction.clear(); @@ -3196,12 +3197,30 @@ class BoUpSLP { /// \returns Common mask for reorder indices and reused scalars. SmallVector getCommonMask() const { + if (State == TreeEntry::SplitVectorize) + return {}; SmallVector Mask; inversePermutation(ReorderIndices, Mask); ::addMask(Mask, ReuseShuffleIndices); return Mask; } + /// \returns The mask for split nodes. + SmallVector getSplitMask() const { + assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() && + "Expected only split vectorize node."); + SmallVector Mask(getVectorFactor(), PoisonMaskElem); + unsigned CommonVF = std::max( + CombinedEntriesWithIndices.back().second, + Scalars.size() - CombinedEntriesWithIndices.back().second); + for (auto [Idx, I] : enumerate(ReorderIndices)) + Mask[I] = + Idx + (Idx >= CombinedEntriesWithIndices.back().second + ? CommonVF - CombinedEntriesWithIndices.back().second + : 0); + return Mask; + } + /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { auto &&IsSame = [VL](ArrayRef Scalars, ArrayRef Mask) { @@ -3293,6 +3312,8 @@ class BoUpSLP { ///< complex node like select/cmp to minmax, mul/add to ///< fma, etc. Must be used for the following nodes in ///< the pattern, not the very first one. + SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them + ///< independently and then combines back. }; EntryState State; @@ -3324,7 +3345,7 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. 
unsigned Idx = 0; - /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from + /// For gather/buildvector/alt opcode nodes, which are combined from /// other nodes as a series of insertvector instructions. SmallVector, 2> CombinedEntriesWithIndices; @@ -3471,8 +3492,9 @@ class BoUpSLP { SmallVectorImpl *AltScalars = nullptr) const; /// Return true if this is a non-power-of-2 node. - bool isNonPowOf2Vec() const { - bool IsNonPowerOf2 = !has_single_bit(Scalars.size()); + bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const { + bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2( + TTI, getValueType(Scalars.front()), Scalars.size()); return IsNonPowerOf2; } @@ -3530,6 +3552,9 @@ class BoUpSLP { case CombinedVectorize: dbgs() << "CombinedVectorize\n"; break; + case SplitVectorize: + dbgs() << "SplitVectorize\n"; + break; } dbgs() << "MainOp: "; if (MainOp) @@ -3611,8 +3636,10 @@ class BoUpSLP { const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = {}, ArrayRef ReorderIndices = {}) { - assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || - (Bundle && EntryState != TreeEntry::NeedToGather)) && + assert(((!Bundle && (EntryState == TreeEntry::NeedToGather || + EntryState == TreeEntry::SplitVectorize)) || + (Bundle && EntryState != TreeEntry::NeedToGather && + EntryState != TreeEntry::SplitVectorize)) && "Need to vectorize gather entry?"); // Gathered loads still gathered? Do not create entry, use the original one. if (GatheredLoadsEntriesFirst.has_value() && @@ -3646,12 +3673,29 @@ class BoUpSLP { return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); - if (S) + if (S) { Last->setOperations(S); + } else if (EntryState == TreeEntry::SplitVectorize) { + auto *MainOp = + cast(*find_if(Last->Scalars, IsaPred)); + auto *AltOp = cast(*find_if(Last->Scalars, [=](Value *V) { + auto *I = dyn_cast(V); + return I && I->getOpcode() != MainOp->getOpcode(); + })); + Last->setOperations(InstructionsState(MainOp, AltOp)); + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + ScalarsInSplitNodes.try_emplace(I, Last); + } + } Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } - if (!Last->isGather()) { + if (!Last->isGather() && Last->State != TreeEntry::SplitVectorize) { for (Value *V : VL) { + if (isa(V)) + continue; const TreeEntry *TE = getTreeEntry(V); assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && "Scalar already in tree!"); @@ -3679,7 +3723,7 @@ class BoUpSLP { } } assert(!BundleMember && "Bundle and VL out of sync"); - } else { + } else if (Last->isGather()) { // Build a map for gathered scalars to the nodes where they are used. bool AllConstsOrCasts = true; for (Value *V : VL) @@ -3745,6 +3789,9 @@ class BoUpSLP { /// nodes. SmallDenseMap> MultiNodeScalars; + /// Scalars, used in split vectorize nodes. + SmallDenseMap ScalarsInSplitNodes; + /// Maps a value to the proposed vectorizable size. 
SmallDenseMap InstrElementSize; @@ -5648,12 +5695,14 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { }) && (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) return std::nullopt; - if ((TE.State == TreeEntry::Vectorize || - TE.State == TreeEntry::StridedVectorize) && - (isa(TE.getMainOp()) || - (TopToBottom && isa(TE.getMainOp())))) { - assert(!TE.isAltShuffle() && "Alternate instructions are only supported by " - "BinaryOperator and CastInst."); + if (TE.State == TreeEntry::SplitVectorize || + ((TE.State == TreeEntry::Vectorize || + TE.State == TreeEntry::StridedVectorize) && + (isa(TE.getMainOp()) || + (TopToBottom && isa(TE.getMainOp()))))) { + assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) && + "Alternate instructions are only supported by " + "BinaryOperator and CastInst."); return TE.ReorderIndices; } if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { @@ -5938,7 +5987,7 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->isAltShuffle()) { + if (TE->isAltShuffle() && TE->State != TreeEntry::SplitVectorize) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -5976,7 +6025,8 @@ void BoUpSLP::reorderTopToBottom() { } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) || + TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && @@ -5985,6 +6035,30 @@ void BoUpSLP::reorderTopToBottom() { } }); + auto UpdateSplitUserNode = [&](TreeEntry *UserTE, unsigned Idx, + ArrayRef Mask, ArrayRef MaskOrder) { + assert(UserTE->State == TreeEntry::SplitVectorize && + "Expected split user node."); + SmallVector NewMask(UserTE->getVectorFactor()); + SmallVector NewMaskOrder(UserTE->getVectorFactor()); + std::iota(NewMask.begin(), NewMask.end(), 0); + std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0); + if (Idx == 0) { + copy(Mask, NewMask.begin()); + copy(MaskOrder, NewMaskOrder.begin()); + } else { + assert(Idx == 1 && "Expected either 0 or 1 index."); + unsigned Offset = UserTE->CombinedEntriesWithIndices.back().second; + for (unsigned I : seq(Mask.size())) { + NewMask[I + Offset] = Mask[I] + Offset; + NewMaskOrder[I + Offset] = MaskOrder[I] + Offset; + } + } + reorderScalars(UserTE->Scalars, NewMask); + reorderOrder(UserTE->ReorderIndices, NewMaskOrder, /*BottomOrder=*/true); + if (isIdentityOrder(UserTE->ReorderIndices)) + UserTE->ReorderIndices.clear(); + }; // Reorder the graph nodes according to their vectorization factor. for (unsigned VF = VectorizableTree.front()->getVectorFactor(); !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) { @@ -6007,7 +6081,8 @@ void BoUpSLP::reorderTopToBottom() { for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, // just need to merge reordering shuffle and the reuse shuffle. 
- if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) && + OpTE->State != TreeEntry::SplitVectorize) continue; // Count number of orders uses. const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, @@ -6114,6 +6189,8 @@ void BoUpSLP::reorderTopToBottom() { // Just do the reordering for the nodes with the given VF. if (TE->Scalars.size() != VF) { if (TE->ReuseShuffleIndices.size() == VF) { + assert(TE->State != TreeEntry::SplitVectorize && + "Split vectorized not expected."); // Need to reorder the reuses masks of the operands with smaller VF to // be able to find the match between the graph nodes and scalar // operands of the given node during vectorization/cost estimation. @@ -6121,7 +6198,8 @@ void BoUpSLP::reorderTopToBottom() { [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == - TE->Scalars.size(); + TE->Scalars.size() || + EI.UserTE->State == TreeEntry::SplitVectorize; }) && "All users must be of VF size."); if (SLPReVec) { @@ -6144,19 +6222,29 @@ void BoUpSLP::reorderTopToBottom() { // Update ordering of the operands with the smaller VF than the given // one. reorderNodeWithReuses(*TE, Mask); + // Update orders in user split vectorize nodes. + for (EdgeInfo &EI : TE->UserTreeIndices) { + if (EI.UserTE->State != TreeEntry::SplitVectorize) + continue; + UpdateSplitUserNode(EI.UserTE, EI.EdgeIdx, Mask, MaskOrder); + } } continue; } - if ((TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) && - (isa(TE->getMainOp()) || - (SLPReVec && isa(TE->getMainOp())))) { - assert(!TE->isAltShuffle() && - "Alternate instructions are only supported by BinaryOperator " - "and CastInst."); - // Build correct orders for extract{element,value}, loads and - // stores. + if ((TE->State == TreeEntry::SplitVectorize && + TE->ReuseShuffleIndices.empty()) || + ((TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::StridedVectorize) && + (isa(TE->getMainOp()) || + (SLPReVec && isa(TE->getMainOp()))))) { + assert( + (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize && + TE->ReuseShuffleIndices.empty())) && + "Alternate instructions are only supported by BinaryOperator " + "and CastInst."); + // Build correct orders for extract{element,value}, loads, + // stores and alternate (split) nodes. reorderOrder(TE->ReorderIndices, Mask); if (isa(TE->getMainOp())) TE->reorderOperands(Mask); @@ -6177,6 +6265,12 @@ void BoUpSLP::reorderTopToBottom() { addMask(NewReuses, TE->ReuseShuffleIndices); TE->ReuseShuffleIndices.swap(NewReuses); } + // Update orders in user split vectorize nodes. + for (EdgeInfo &EI : TE->UserTreeIndices) { + if (EI.UserTE->State != TreeEntry::SplitVectorize) + continue; + UpdateSplitUserNode(EI.UserTE, EI.EdgeIdx, Mask, MaskOrder); + } } } } @@ -6189,7 +6283,8 @@ bool BoUpSLP::canReorderOperands( if (any_of(Edges, [I](const std::pair &OpData) { return OpData.first == I && (OpData.second->State == TreeEntry::Vectorize || - OpData.second->State == TreeEntry::StridedVectorize); + OpData.second->State == TreeEntry::StridedVectorize || + OpData.second->State == TreeEntry::SplitVectorize); })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { @@ -6207,6 +6302,7 @@ bool BoUpSLP::canReorderOperands( // node, just reorder reuses mask. 
if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); continue; @@ -6216,6 +6312,7 @@ bool BoUpSLP::canReorderOperands( [&Gather, UserTE, I](TreeEntry *TE) { assert(TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && "Only non-vectorized nodes are expected."); if (any_of(TE->UserTreeIndices, [UserTE, I](const EdgeInfo &EI) { @@ -6245,13 +6342,15 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector NonVectorized; for (const std::unique_ptr &TE : VectorizableTree) { if (TE->State != TreeEntry::Vectorize && - TE->State != TreeEntry::StridedVectorize) + TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize) || + TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.insert(TE.get()); } @@ -6270,6 +6369,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || + TE->State == TreeEntry::SplitVectorize || (TE->isGather() && GathersToOrders.contains(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || !all_of(drop_begin(TE->UserTreeIndices), @@ -6295,6 +6395,51 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Data1.first->Idx > Data2.first->Idx; }); for (auto &Data : UsersVec) { + if (Data.first->State == TreeEntry::SplitVectorize) { + assert( + Data.second.size() <= 2 && + "Expected not greater than 2 operands for split vectorize node."); + if (any_of(Data.second, [](const auto &Op) { + return Op.second->UserTreeIndices.size() != 1; + })) + continue; + // Update orders in user split vectorize nodes. + for (const auto &P : Data.first->CombinedEntriesWithIndices) { + TreeEntry &OpTE = *VectorizableTree[P.first].get(); + if (OpTE.isGather() || OpTE.ReorderIndices.empty()) + continue; + SmallVector Mask; + inversePermutation(OpTE.ReorderIndices, Mask); + SmallVector MaskOrder(OpTE.ReorderIndices.size(), + PoisonMaskElem); + unsigned E = OpTE.ReorderIndices.size(); + transform(OpTE.ReorderIndices, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast(I) : PoisonMaskElem; + }); + SmallVector NewMask(Data.first->getVectorFactor()); + SmallVector NewMaskOrder(Data.first->getVectorFactor()); + std::iota(NewMask.begin(), NewMask.end(), 0); + std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0); + if (P.second == 0) { + copy(Mask, NewMask.begin()); + copy(MaskOrder, NewMaskOrder.begin()); + } else { + unsigned Offset = P.second; + for (unsigned I : seq(Mask.size())) { + NewMask[I + Offset] = Mask[I] + Offset; + NewMaskOrder[I + Offset] = MaskOrder[I] + Offset; + } + } + reorderScalars(Data.first->Scalars, NewMask); + reorderOrder(Data.first->ReorderIndices, NewMaskOrder, + /*BottomOrder=*/true); + if (isIdentityOrder(Data.first->ReorderIndices)) + Data.first->ReorderIndices.clear(); + // Clear ordering of the operand. + OpTE.ReorderIndices.clear(); + } + continue; + } // Check that operands are used only in the User node. 
SmallVector GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, @@ -6451,6 +6596,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Gathers are processed separately. if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && + TE->State != TreeEntry::SplitVectorize && (TE->State != TreeEntry::ScatterVectorize || TE->ReorderIndices.empty())) continue; @@ -6521,7 +6667,7 @@ void BoUpSLP::buildExternalUses( TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->isGather()) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) continue; // For each lane: @@ -8227,6 +8373,142 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // Tries to build split node. + auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize, + const InstructionsState &LocalState) { + if (VL.size() <= SmallNodeSize) + return false; + + // Any value is used in split node already - just gather. + if (any_of(VL, [&](Value *V) { + return ScalarsInSplitNodes.contains(V) || + ScalarToTreeEntry.contains(V); + })) { + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return true; + } + SmallVector Op1, Op2; + OrdersType ReorderIndices(VL.size(), VL.size()); + SmallBitVector Op1Indices(VL.size()); + for (auto [Idx, V] : enumerate(VL)) { + auto *I = dyn_cast(V); + if (!I) { + Op1.push_back(V); + Op1Indices.set(Idx); + continue; + } + InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI); + if (NewS && !NewS.isAltShuffle()) { + Op1.push_back(V); + Op1Indices.set(Idx); + continue; + } + Op2.push_back(V); + } + Type *ScalarTy = getValueType(VL.front()); + VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); + unsigned Opcode0 = LocalState.getOpcode(); + unsigned Opcode1 = LocalState.getAltOpcode(); + SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1)); + // Enable split node, only if all nodes are power-of-2/full registers and + // do not form legal alternate instruction (like X86 addsub). + SmallPtrSet UOp1(Op1.begin(), Op1.end()); + SmallPtrSet UOp2(Op2.begin(), Op2.end()); + if (UOp1.size() <= 1 || UOp2.size() <= 1 || + TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) || + !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), UOp1.size()) || + !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), UOp2.size())) + return false; + unsigned Op1Cnt = 0, Op2Cnt = Op1.size(); + for (unsigned Idx : seq(VL.size())) { + if (Op1Indices.test(Idx)) { + ReorderIndices[Op1Cnt] = Idx; + ++Op1Cnt; + } else { + ReorderIndices[Op2Cnt] = Idx; + ++Op2Cnt; + } + } + if (isIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); + SmallVector Mask; + if (!ReorderIndices.empty()) + inversePermutation(ReorderIndices, Mask); + unsigned NumParts = TTI.getNumberOfParts(VecTy); + VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size()); + // Check non-profitable single register ops, which better to be represented + // as alternate ops. 
+ if (NumParts >= VL.size()) + return false; + if (NumParts <= 1 && LocalState.getMainOp()->isBinaryOp() && + LocalState.getAltOp()->isBinaryOp()) { + bool AreShifts = + LocalState.getMainOp()->isShift() && LocalState.getAltOp()->isShift(); + bool AreBitwiseLogics = LocalState.getMainOp()->isBitwiseLogicOp() && + LocalState.getAltOp()->isBitwiseLogicOp(); + constexpr std::array AddSub = { + Instruction::Add, Instruction::Sub, Instruction::FAdd, + Instruction::FSub}; + constexpr std::array MulDiv = { + Instruction::Mul, Instruction::FMul, Instruction::SDiv, + Instruction::UDiv, Instruction::FDiv, Instruction::SRem, + Instruction::URem, Instruction::FRem}; + bool AreAddSubs = !AreShifts && !AreBitwiseLogics && + is_contained(AddSub, Opcode0) && + is_contained(AddSub, Opcode1); + bool AreMulDivs = !AreShifts && !AreBitwiseLogics && !AreAddSubs && + is_contained(MulDiv, Opcode0) && + is_contained(MulDiv, Opcode1); + if ((AreShifts || AreBitwiseLogics || AreAddSubs || AreMulDivs) && + (Mask.empty() || + ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, {}, + TTI::TCK_RecipThroughput, Op1.size(), Op2VecTy) >= + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, VecTy, Mask, + TTI::TCK_RecipThroughput))) + return false; + } + SmallVector NewVL(VL.size()); + copy(Op1, NewVL.begin()); + copy(Op2, std::next(NewVL.begin(), Op1.size())); + auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt, + LocalState, UserTreeIdx, {}, ReorderIndices); + LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump()); + auto AddNode = [&](ArrayRef Op, unsigned Idx) { + auto FindPrevEntry = [&](ArrayRef VL) -> TreeEntry * { + TreeEntry *SE = getTreeEntry(VL.front()); + if (!SE) + return nullptr; + if (SE->isSame(VL)) + return SE; + for (TreeEntry *SE : MultiNodeScalars.lookup(VL.front())) { + if (SE->isSame(VL)) + return SE; + } + return nullptr; + }; + if (TreeEntry *PrevSE = FindPrevEntry(Op)) { + TE->CombinedEntriesWithIndices.emplace_back(PrevSE->Idx, + Idx == 0 ? 0 : Op1.size()); + PrevSE->UserTreeIndices.emplace_back(TE, Idx); + } else if (isa(Op.front())) { + // Build gather node for loads, they will be gathered later. + TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(), + Idx == 0 ? 0 : Op1.size()); + (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt, + getSameOpcode(Op, *TLI), {TE, Idx}); + } else { + TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(), + Idx == 0 ? 0 : Op1.size()); + buildTree_rec(Op, Depth, {TE, Idx}); + } + }; + AddNode(Op1, 0); + AddNode(Op2, 1); + return true; + }; + // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. 
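To illustrate the index bookkeeping in the TrySplitNode lambda added above, here is a minimal standalone C++ sketch. It is not LLVM code: the names bundle, op1Lanes, reorder and mask are invented, opcodes are compared as plain strings rather than through getSameOpcode(), and both halves are assumed to end up with the same width. Lanes are partitioned into the main-opcode group (Op1) and the alternate group (Op2), ReorderIndices records which original lane each position of the split [Op1 | Op2] layout came from, and in the equal-width case the mask returned by getSplitMask() is simply the inverse of that permutation.

// Standalone sketch of the split-node lane bookkeeping (not LLVM code).
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Lane opcodes of a hypothetical mixed bundle; the main opcode is "add".
  std::vector<std::string> bundle = {"add", "sub", "add", "sub",
                                     "add", "sub", "add", "sub"};
  const std::string MainOpc = bundle.front();

  // Partition lanes: main-opcode lanes go to Op1, the rest to Op2.
  std::vector<unsigned> op1Lanes, op2Lanes;
  for (unsigned Lane = 0; Lane < bundle.size(); ++Lane)
    (bundle[Lane] == MainOpc ? op1Lanes : op2Lanes).push_back(Lane);

  // ReorderIndices[position in the split layout] = original lane,
  // with all Op1 lanes first and all Op2 lanes second.
  std::vector<unsigned> reorder;
  reorder.insert(reorder.end(), op1Lanes.begin(), op1Lanes.end());
  reorder.insert(reorder.end(), op2Lanes.begin(), op2Lanes.end());

  // Recombination mask (equal-width case of getSplitMask()): element I of the
  // final vector is taken from position mask[I] of the concatenated
  // <Op1 | Op2> vector, i.e. the inverse permutation of reorder.
  std::vector<int> mask(bundle.size(), -1);
  for (unsigned Pos = 0; Pos < reorder.size(); ++Pos)
    mask[reorder[Pos]] = static_cast<int>(Pos);

  std::printf("ReorderIndices:");
  for (unsigned R : reorder)
    std::printf(" %u", R);
  std::printf("\nSplit mask:    ");
  for (int M : mask)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}

For the interleaved add/sub bundle above this prints ReorderIndices 0 2 4 6 1 3 5 7 and the split mask 0 4 1 5 2 6 3 7: the two sub-nodes are vectorized contiguously, and a single shuffle with that mask restores the original lane order.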
@@ -8312,6 +8594,48 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { + if (!S) { + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + for (Value *V : VL) { + if (isa(V)) + continue; + auto *I = dyn_cast(V); + if (!I) { + MainOp = AltOp = nullptr; + break; + } + if (!MainOp) { + MainOp = I; + continue; + } + if (MainOp->getOpcode() == I->getOpcode()) { + if (I->getParent() != MainOp->getParent()) { + MainOp = AltOp = nullptr; + break; + } + continue; + } + if (!AltOp) { + AltOp = I; + continue; + } + if (AltOp->getOpcode() == I->getOpcode()) { + if (I->getParent() != AltOp->getParent()) { + MainOp = AltOp = nullptr; + break; + } + continue; + } + MainOp = AltOp = nullptr; + break; + } + // Last chance to try to vectorize alternate node. + constexpr unsigned SmallNodeSize = 4; + if (MainOp && AltOp && + TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp))) + return; + } LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, @@ -8391,6 +8715,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // FIXME: investigate if there are profitable cases for VL.size() <= 4. + constexpr unsigned SmallNodeSize = 4; + if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S)) + return; + // Check that every instruction appears once in this bundle. if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; @@ -8423,6 +8752,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); + // Last chance to try to vectorize alternate node. + constexpr unsigned SmallNodeSize = 4; + if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S)) + return; newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); NonScheduledFirst.insert(VL.front()); @@ -8567,6 +8900,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); break; case TreeEntry::CombinedVectorize: + case TreeEntry::SplitVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected loads state."); } @@ -9771,6 +10105,12 @@ void BoUpSLP::transformNodes() { reorderGatherNode(E); } + bool ForceLoadGather = + count_if(VectorizableTree, [](const std::unique_ptr &TE) { + return TE->isGather() && TE->getOpcode() == Instruction::Load && + TE->getVectorFactor() < 16; + }) == 2; + // The tree may grow here, so iterate over nodes, built before. 
for (unsigned Idx : seq(BaseGraphSize)) { TreeEntry &E = *VectorizableTree[Idx]; @@ -9785,6 +10125,49 @@ void BoUpSLP::transformNodes() { E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; + if (ForceLoadGather && E.getOpcode() == Instruction::Load) + continue; + auto AreReusedScalars = + [&](const TreeEntry *TE, + function_ref CheckContainer) { + return TE->isSame(VL) || all_of(VL, [&](Value *V) { + if (isa(V)) + return true; + auto *I = dyn_cast(V); + return I && CheckContainer(I, TE); + }); + }; + if (E.getOpcode()) { + if (const TreeEntry *TE = getTreeEntry(E.getMainOp())) { + if (AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) { + return ScalarToTreeEntry.lookup(V) == TE; + })) + continue; + auto It = MultiNodeScalars.find(E.getMainOp()); + if (It != MultiNodeScalars.end() && + any_of(It->getSecond(), [&](const TreeEntry *TE) { + return AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) { + return is_contained(MultiNodeScalars.lookup(V), TE); + }); + })) + continue; + } + if (const TreeEntry *TE = ScalarsInSplitNodes.lookup(E.getMainOp())) + if (AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) { + return ScalarsInSplitNodes.lookup(V) == TE; + })) + continue; + } else { + // Check if the gather node full copy of split node. + auto *It = find_if(VL, IsaPred); + if (It != VL.end()) { + if (const TreeEntry *TE = ScalarsInSplitNodes.lookup(*It)) + if (AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) { + return ScalarsInSplitNodes.lookup(V) == TE; + })) + continue; + } + } // Try to find vectorizable sequences and transform them into a series of // insertvector instructions. unsigned StartIdx = 0; @@ -10038,7 +10421,7 @@ void BoUpSLP::transformNodes() { (VectorizableTree.size() <= 2 && UserIgnoreList)) return; - if (VectorizableTree.front()->isNonPowOf2Vec() && + if (VectorizableTree.front()->isNonPowOf2Vec(*TTI) && getCanonicalGraphSize() != getTreeSize() && UserIgnoreList && getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), @@ -11025,10 +11408,9 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, return VE; const auto *It = find_if(VectorizableTree, [&](const std::unique_ptr &TE) { - return TE->isGather() && - find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.EdgeIdx == Idx && EI.UserTE == E; - }) != TE->UserTreeIndices.end(); + return (TE->isGather() || TE->State == TreeEntry::SplitVectorize) && + is_contained(TE->UserTreeIndices, + EdgeInfo(const_cast(E), Idx)); }); assert(It != VectorizableTree.end() && "Expected vectorizable entry."); return It->get(); @@ -11108,6 +11490,32 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return processBuildVector( E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); } + if (E->State == TreeEntry::SplitVectorize) { + assert(E->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 combined entries."); + assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask."); + InstructionCost VectorCost = 0; + if (E->ReorderIndices.empty()) { + VectorCost = ::getShuffleCost( + *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind, + E->CombinedEntriesWithIndices.back().second, + getWidenedType( + ScalarTy, + VectorizableTree[E->CombinedEntriesWithIndices.back().first] + ->getVectorFactor())); + } else { + unsigned CommonVF = + std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first] + ->getVectorFactor(), + 
VectorizableTree[E->CombinedEntriesWithIndices.back().first] + ->getVectorFactor()); + VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + getWidenedType(ScalarTy, CommonVF), + E->getSplitMask(), CostKind); + } + LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree")); + return VectorCost; + } InstructionCost CommonCost = 0; SmallVector Mask; if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize || @@ -11191,7 +11599,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, EI.EdgeIdx != 0) { auto UserBWIt = MinBWs.find(EI.UserTE); Type *UserScalarTy = - EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); + EI.UserTE->State == TreeEntry::SplitVectorize + ? EI.UserTE->Scalars.front()->getType() + : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); if (UserBWIt != MinBWs.end()) UserScalarTy = IntegerType::get(ScalarTy->getContext(), UserBWIt->second.first); @@ -11680,6 +12090,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, break; } case TreeEntry::CombinedVectorize: + case TreeEntry::SplitVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected vectorization state."); } @@ -12134,7 +12545,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { bool BoUpSLP::isTreeNotExtendable() const { if (getCanonicalGraphSize() != getTreeSize()) { constexpr unsigned SmallTree = 3; - if (VectorizableTree.front()->isNonPowOf2Vec() && + if (VectorizableTree.front()->isNonPowOf2Vec(*TTI) && getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { @@ -12148,6 +12559,8 @@ bool BoUpSLP::isTreeNotExtendable() const { bool Res = false; for (unsigned Idx : seq(getTreeSize())) { TreeEntry &E = *VectorizableTree[Idx]; + if (E.State == TreeEntry::SplitVectorize) + return false; if (!E.isGather()) continue; if (E.getOpcode() && E.getOpcode() != Instruction::Load) @@ -12460,7 +12873,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather()) { + if (TE.isGather() || TE.State == TreeEntry::SplitVectorize) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && E->isSame(TE.Scalars)) { @@ -13598,7 +14011,7 @@ BoUpSLP::isGatherShuffledEntry( }))) return {}; // FIXME: Gathering for non-power-of-2 nodes not implemented yet. 
- if (TE->isNonPowOf2Vec()) + if (TE->isNonPowOf2Vec(*TTI)) return {}; Mask.assign(VL.size(), PoisonMaskElem); assert((TE->UserTreeIndices.size() == 1 || @@ -13732,6 +14145,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { assert(((GatheredLoadsEntriesFirst.has_value() && E->getOpcode() == Instruction::Load && E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) || + E->State == TreeEntry::SplitVectorize || all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && @@ -13757,6 +14171,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { } assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || + E->State == TreeEntry::SplitVectorize || (isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I)) || (GatheredLoadsEntriesFirst.has_value() && @@ -13818,6 +14233,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return FirstInst; }; + if (E->State == TreeEntry::SplitVectorize) { + Res = FindLastInst(); + return *Res; + } + // Set insertpoint for gathered loads to the very first load. if (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && @@ -14746,12 +15166,15 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, // Find the corresponding gather entry and vectorize it. // Allows to be more accurate with tree/graph transformations, checks for the // correctness of the transformations in many cases. - auto *I = find_if(VectorizableTree, - [E, NodeIdx](const std::unique_ptr &TE) { - return TE->isOperandGatherNode({E, NodeIdx}); - }); - assert(I != VectorizableTree.end() && "Gather node is not in the graph."); - assert(I->get()->UserTreeIndices.size() == 1 && + auto *I = find_if( + VectorizableTree, [E, NodeIdx](const std::unique_ptr &TE) { + return TE->isOperandGatherNode({E, NodeIdx}) || + (TE->State == TreeEntry::SplitVectorize && + is_contained(TE->UserTreeIndices, EdgeInfo(E, NodeIdx))); + }); + assert(I != VectorizableTree.end() && + "Gather/split node node is not in the graph."); + assert((!I->get()->isGather() || I->get()->UserTreeIndices.size() == 1) && "Expected only single user for the gather node."); assert(I->get()->isSame(VL) && "Expected same list of scalars."); return vectorizeTree(I->get(), PostponedPHIs); @@ -15297,6 +15720,62 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { E->VectorizedValue = Vec; return Vec; } + if (E->State == TreeEntry::SplitVectorize) { + assert(E->CombinedEntriesWithIndices.size() == 2 && + "Expected exactly 2 combined entries."); + setInsertPointAfterBundle(E); + TreeEntry &OpTE1 = + *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get(); + assert(OpTE1.isSame( + ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) && + "Expected same first part of scalars."); + Value *Op1 = vectorizeTree(&OpTE1, PostponedPHIs); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for node " << E->Idx << ".\n"; + E->dump()); + return E->VectorizedValue; + } + TreeEntry &OpTE2 = + *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get(); + assert( + OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) && + "Expected same second part of scalars."); + Value *Op2 = vectorizeTree(&OpTE2, PostponedPHIs); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for node " << E->Idx << ".\n"; + E->dump()); + return E->VectorizedValue; + } + if (E->ReorderIndices.empty()) { + 
SmallVector Mask(E->getVectorFactor(), PoisonMaskElem); + std::iota( + Mask.begin(), + std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second), + 0); + Value *Vec = Builder.CreateShuffleVector(Op1, Mask); + Vec = createInsertVector(Builder, Vec, Op2, + E->CombinedEntriesWithIndices.back().second); + E->VectorizedValue = Vec; + return Vec; + } + unsigned CommonVF = + std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor()); + if (getNumElements(Op1->getType()) != CommonVF) { + SmallVector Mask(CommonVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()), + 0); + Op1 = Builder.CreateShuffleVector(Op1, Mask); + } + if (getNumElements(Op2->getType()) != CommonVF) { + SmallVector Mask(CommonVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()), + 0); + Op2 = Builder.CreateShuffleVector(Op2, Mask); + } + Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask()); + E->VectorizedValue = Vec; + return Vec; + } bool IsReverseOrder = !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); @@ -16347,7 +16826,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, for (const TreeEntry *E : PostponedNodes) { auto *TE = const_cast(E); if (auto *VecTE = getTreeEntry(TE->Scalars.front())) - if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand( + if (TE->UserTreeIndices.front().UserTE->State != + TreeEntry::SplitVectorize && + VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand( TE->UserTreeIndices.front().EdgeIdx)) && VecTE->isSame(TE->Scalars)) // Found gather node which is absolutely the same as one of the @@ -16850,7 +17331,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->isGather()) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -16907,6 +17388,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, return EI.UserTE == VectorizableTree.front().get() && EI.EdgeIdx == UINT_MAX; }))) && + !(VectorizableTree.front()->State == TreeEntry::SplitVectorize && + !IE->UserTreeIndices.empty() && + is_contained(VectorizableTree.front()->Scalars, I)) && !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && @@ -17927,6 +18411,13 @@ bool BoUpSLP::collectValuesToDemote( ToDemote.push_back(E.Idx); return IsProfitableToDemote; }; + + if (E.State == TreeEntry::SplitVectorize) + return TryProcessInstruction( + BitWidth, + {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), + VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + switch (E.getOpcode()) { // We can always demote truncations and extensions. 
Since truncations can diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 1cf20cb1fd64d..878f59c460c2f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -31,82 +31,108 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[RDD_PTR_2]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[RDD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> 
[[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], splat (i32 16) -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]] -; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]] -; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]] -; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], splat (i32 15) -; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], splat (i32 65537) -; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], splat (i32 65535) -; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]] -; CHECK-NEXT: [[TMP73:%.*]] = xor 
<16 x i32> [[TMP72]], [[TMP71]] -; CHECK-NEXT: [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = shl nsw <4 x i32> [[TMP9]], splat (i32 16) +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = sub nsw <4 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <4 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <4 x i32> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = sub nsw <4 x i32> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = shl nsw <4 x i32> [[TMP29]], splat (i32 16) +; CHECK-NEXT: [[TMP31:%.*]] = add nsw <4 x i32> [[TMP30]], [[TMP24]] +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP31]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = sub nsw <4 x i32> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = sub nsw <4 x i32> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP41:%.*]] = zext <4 x i8> [[TMP40]] to <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: 
[[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = sub nsw <4 x i32> [[TMP41]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP46:%.*]] = zext <4 x i8> [[TMP45]] to <4 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <4 x i32> [[TMP46]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <4 x i32> [[TMP49]], splat (i32 16) +; CHECK-NEXT: [[TMP51:%.*]] = add nsw <4 x i32> [[TMP50]], [[TMP44]] +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i32> [[TMP51]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = add nsw <4 x i32> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <4 x i32> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP54]], <4 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <4 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP58]], <4 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP61:%.*]] = zext <4 x i8> [[TMP60]] to <4 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP63:%.*]] = zext <4 x i8> [[TMP62]] to <4 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = sub nsw <4 x i32> [[TMP61]], [[TMP63]] +; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP68:%.*]] = zext <4 x i8> [[TMP67]] to <4 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = sub nsw <4 x i32> [[TMP66]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = shl nsw <4 x i32> [[TMP69]], splat (i32 16) +; CHECK-NEXT: [[TMP71:%.*]] = add nsw <4 x i32> [[TMP70]], [[TMP64]] +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <4 x i32> [[TMP71]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = add nsw <4 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <4 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP73]], <4 x i32> [[TMP74]], <4 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <4 x i32> [[TMP75]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = add nsw <4 x i32> [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = sub nsw <4 x i32> [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP77]], <4 x i32> [[TMP78]], <4 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP19]] +; CHECK-NEXT: [[TMP81:%.*]] = sub nsw <4 x i32> [[TMP19]], [[TMP39]] +; CHECK-NEXT: [[TMP82:%.*]] = add nsw <4 x i32> [[TMP79]], [[TMP59]] +; CHECK-NEXT: [[TMP83:%.*]] = sub nsw <4 x i32> [[TMP59]], [[TMP79]] +; CHECK-NEXT: [[TMP84:%.*]] = add nsw <4 x i32> [[TMP83]], [[TMP81]] +; CHECK-NEXT: [[TMP85:%.*]] = add nsw <4 x i32> [[TMP82]], [[TMP80]] +; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <8 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub nsw <4 x i32> [[TMP80]], [[TMP82]] +; CHECK-NEXT: [[TMP88:%.*]] = sub nsw <4 x i32> [[TMP81]], 
[[TMP83]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP87]], <4 x i32> [[TMP88]], <8 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <16 x i32> +; CHECK-NEXT: [[TMP91:%.*]] = shufflevector <4 x i32> [[TMP87]], <4 x i32> [[TMP88]], <16 x i32> +; CHECK-NEXT: [[TMP92:%.*]] = shufflevector <16 x i32> [[TMP90]], <16 x i32> [[TMP91]], <16 x i32> +; CHECK-NEXT: [[TMP93:%.*]] = lshr <16 x i32> [[TMP92]], splat (i32 15) +; CHECK-NEXT: [[TMP94:%.*]] = and <16 x i32> [[TMP93]], splat (i32 65537) +; CHECK-NEXT: [[TMP95:%.*]] = mul nuw <16 x i32> [[TMP94]], splat (i32 65535) +; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <16 x i32> [[TMP95]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP86]], <8 x i32> [[TMP89]], <16 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = add <16 x i32> [[TMP96]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = xor <16 x i32> [[TMP98]], [[TMP96]] +; CHECK-NEXT: [[TMP100:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP99]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP100]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP100]], 16 ; CHECK-NEXT: [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[RDD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll index 3ebe920d17343..cfa7d83224aa2 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll @@ -5,7 +5,11 @@ define void @h() { ; CHECK-LABEL: define void @h() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> zeroinitializer, i64 4) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> zeroinitializer, i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP2]] to <8 x i16> +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 9ce79e5ea356b..9dec8c219da5f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -684,27 +684,27 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[MUL]], 2 ; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX16]], 
align 4 ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 ; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 ; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 -; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 -; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24 @@ -715,25 +715,22 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28 ; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 32 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44 -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 36 ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX65]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store i32 [[MUL81]], ptr 
[[ARRAYIDX76]], align 4 ; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -1216,70 +1213,105 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = sub nsw <4 x i32> [[TMP20]], [[TMP22]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = sub nsw <4 x i32> [[TMP40]], [[TMP42]] +; CHECK-NEXT: [[TMP47:%.*]] = shl nsw <4 x i32> [[TMP45]], splat (i32 16) +; CHECK-NEXT: [[TMP67:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP25]] +; CHECK-NEXT: [[TMP92:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP92]] +; CHECK-NEXT: [[TMP14:%.*]] = sub nsw <4 x i32> [[TMP67]], [[TMP92]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <4 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <4 x i32> [[TMP21]], [[TMP23]] ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = sub nsw <4 x i32> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = shl nsw <4 x i32> [[TMP29]], splat (i32 16) +; CHECK-NEXT: [[TMP31:%.*]] = add nsw <4 x i32> [[TMP30]], [[TMP24]] +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP31]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP94:%.*]] = sub nsw <4 x i32> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector 
<4 x i32> [[TMP33]], <4 x i32> [[TMP94]], <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = sub nsw <4 x i32> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = sub nsw <4 x i32> [[TMP41]], [[TMP43]] ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP46:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <4 x i32> [[TMP46]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <4 x i32> [[TMP49]], splat (i32 16) +; CHECK-NEXT: [[TMP51:%.*]] = add nsw <4 x i32> [[TMP50]], [[TMP44]] +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i32> [[TMP51]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = add nsw <4 x i32> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <4 x i32> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP54]], <4 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <4 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP95:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP58]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP98:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = sub nsw <4 x i32> [[TMP97]], [[TMP98]] ; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> 
[[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], splat (i32 16) -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[TMP52]] -; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP52]] -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]] -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = sub nsw <4 x i32> [[TMP66]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = shl nsw <4 x i32> [[TMP69]], splat (i32 16) +; CHECK-NEXT: [[TMP71:%.*]] = add nsw <4 x i32> [[TMP70]], [[TMP99]] +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <4 x i32> [[TMP71]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = add nsw <4 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <4 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP73]], <4 x i32> [[TMP74]], <4 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <4 x i32> [[TMP75]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = add nsw <4 x i32> [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = sub nsw <4 
x i32> [[TMP75]], [[TMP76]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP77]], <4 x i32> [[TMP78]], <4 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP93]] +; CHECK-NEXT: [[TMP81:%.*]] = sub nsw <4 x i32> [[TMP93]], [[TMP39]] +; CHECK-NEXT: [[TMP82:%.*]] = add nsw <4 x i32> [[TMP79]], [[TMP95]] +; CHECK-NEXT: [[TMP83:%.*]] = sub nsw <4 x i32> [[TMP95]], [[TMP79]] +; CHECK-NEXT: [[TMP84:%.*]] = add nsw <4 x i32> [[TMP83]], [[TMP81]] +; CHECK-NEXT: [[TMP85:%.*]] = add nsw <4 x i32> [[TMP82]], [[TMP80]] +; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <8 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub nsw <4 x i32> [[TMP80]], [[TMP82]] +; CHECK-NEXT: [[TMP88:%.*]] = sub nsw <4 x i32> [[TMP81]], [[TMP83]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP87]], <4 x i32> [[TMP88]], <8 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <16 x i32> +; CHECK-NEXT: [[TMP91:%.*]] = shufflevector <4 x i32> [[TMP87]], <4 x i32> [[TMP88]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP90]], <16 x i32> [[TMP91]], <16 x i32> ; CHECK-NEXT: [[TMP60:%.*]] = lshr <16 x i32> [[TMP59]], splat (i32 15) ; CHECK-NEXT: [[TMP61:%.*]] = and <16 x i32> [[TMP60]], splat (i32 65537) ; CHECK-NEXT: [[TMP62:%.*]] = mul nuw <16 x i32> [[TMP61]], splat (i32 65535) -; CHECK-NEXT: [[TMP63:%.*]] = add <16 x i32> [[TMP62]], [[TMP59]] -; CHECK-NEXT: [[TMP64:%.*]] = xor <16 x i32> [[TMP63]], [[TMP62]] +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <8 x i32> [[TMP86]], <8 x i32> [[TMP89]], <16 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = add <16 x i32> [[TMP100]], [[TMP96]] +; CHECK-NEXT: [[TMP64:%.*]] = xor <16 x i32> [[TMP101]], [[TMP100]] ; CHECK-NEXT: [[TMP65:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP64]]) ; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP65]], 65535 ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP65]], 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index c431b058f0d2d..92027d0043f76 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -17,15 +17,12 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3 +; 
CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] ; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 257e4660c80aa..8dcb5112851b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -1,660 +1,276 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s +; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v | FileCheck %s ; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15 define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[CONV1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = 
shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]] -; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], splat (i32 16) -; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]] -; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16) -; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]] -; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]] -; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]] ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]] -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32> -; CHECK-NEXT: [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]] -; CHECK-NEXT: [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16) -; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]] 
-; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; CHECK-NEXT: [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 -; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]] -; CHECK-NEXT: [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; CHECK-NEXT: [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32> -; CHECK-NEXT: [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; CHECK-NEXT: [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]] -; CHECK-NEXT: [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16) -; CHECK-NEXT: [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]] -; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 -; CHECK-NEXT: [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 -; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]] -; CHECK-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]] -; CHECK-NEXT: [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0 -; CHECK-NEXT: [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1 -; CHECK-NEXT: [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]] -; CHECK-NEXT: [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], [[XOR_I63_2]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0 -; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0 -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] -; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]] -; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]] -; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15 -; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 -; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15 -; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 -; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15 -; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15 -; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0 
-; CHECK-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1 -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]] -; CHECK-NEXT: [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]] -; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]] -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15 -; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP92]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP95]] to <4 x i32> ; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32> -; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]] -; CHECK-NEXT: [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16) -; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32> -; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> -; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]] -; CHECK-NEXT: [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16) -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]] -; CHECK-NEXT: [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]] -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]] -; CHECK-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]] -; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> -; CHECK-NEXT: [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]] -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x 
i32> [[TMP116]], [[TMP113]] -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0 -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1 -; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]] -; CHECK-NEXT: [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0 -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]] -; CHECK-NEXT: [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]] -; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15 -; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15 -; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; CHECK-NEXT: [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> -; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32> -; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32> -; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]] -; CHECK-NEXT: [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16) -; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32> -; CHECK-NEXT: [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32> -; CHECK-NEXT: [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32> -; CHECK-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]] -; CHECK-NEXT: [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16) -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]] -; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]] -; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]] -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]] -; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]] -; CHECK-NEXT: [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]] -; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0 -; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]] -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]] -; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0 -; CHECK-NEXT: 
[[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1 -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]] -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15 -; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 -; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15 -; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 -; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; CHECK-NEXT: [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15) -; CHECK-NEXT: [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537) -; CHECK-NEXT: [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535) -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] -; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]] -; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]] -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]] -; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] -; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] -; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]] -; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]] -; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] -; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]] -; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]] -; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] -; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]] -; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]] -; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]] -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]] -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]] -; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]] -; CHECK-NEXT: [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]] -; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 -; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 -; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> 
[[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]] -; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]] -; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]] -; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]] -; CHECK-NEXT: [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15 -; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]] -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]] -; CHECK-NEXT: [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]] -; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1 -; CHECK-NEXT: [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]] -; CHECK-NEXT: [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]] -; CHECK-NEXT: [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]] -; CHECK-NEXT: [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]] -; CHECK-NEXT: [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0 -; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 -; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]] -; CHECK-NEXT: [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]] -; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> -; CHECK-NEXT: [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]] -; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]] -; CHECK-NEXT: [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]] -; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]] -; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15) -; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537) -; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535) -; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]] -; CHECK-NEXT: [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]] -; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 -; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 -; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 -; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] -; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]] -; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0 -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]] -; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1 -; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP190]] -; 
CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP98]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32> +; CHECK-NEXT: [[TMP235:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]] +; CHECK-NEXT: [[TMP194:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16) +; CHECK-NEXT: [[TMP195:%.*]] = add <4 x i32> [[TMP194]], [[TMP235]] +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP195]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP195]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP195]], [[TMP34]] +; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> +; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <4 x i32> [[TMP227]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP228:%.*]] = add <4 x i32> [[TMP227]], [[TMP223]] +; CHECK-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP227]], [[TMP223]] +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP228]], <4 x i32> [[TMP40]], <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16) +; CHECK-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP53]], [[TMP54]] +; 
CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2) +; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) +; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]] +; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> +; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]] +; CHECK-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]] +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP91]], i64 4) +; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] +; CHECK-NEXT: [[TMP119:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP119]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP87]], i64 4) +; CHECK-NEXT: [[TMP96:%.*]] = add <8 x i32> [[TMP90]], [[TMP94]] +; CHECK-NEXT: [[TMP121:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP91]], i64 0) +; CHECK-NEXT: [[TMP97:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP121]], <4 x i32> [[TMP88]], i64 4) +; CHECK-NEXT: [[TMP101:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> 
poison, <4 x i32> [[TMP87]], i64 0) +; CHECK-NEXT: [[TMP99:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP101]], <4 x i32> [[TMP119]], i64 4) +; CHECK-NEXT: [[TMP100:%.*]] = sub <8 x i32> [[TMP97]], [[TMP99]] +; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <8 x i32> [[TMP96]], <8 x i32> [[TMP100]], <16 x i32> +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <16 x i32> [[TMP103]], <16 x i32> [[TMP104]], <16 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <16 x i32> [[TMP105]], <16 x i32> [[TMP106]], <16 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <16 x i32> [[TMP107]], <16 x i32> [[TMP108]], <16 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <4 x i32> [[TMP227]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = shufflevector <16 x i32> [[TMP109]], <16 x i32> [[TMP110]], <16 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <16 x i32> [[TMP111]], <16 x i32> [[TMP112]], <16 x i32> +; CHECK-NEXT: [[TMP114:%.*]] = lshr <16 x i32> [[TMP113]], splat (i32 15) +; CHECK-NEXT: [[TMP115:%.*]] = and <16 x i32> [[TMP114]], splat (i32 65537) +; CHECK-NEXT: [[TMP116:%.*]] = mul <16 x i32> [[TMP115]], splat (i32 65535) +; CHECK-NEXT: [[TMP117:%.*]] = add <16 x i32> [[TMP116]], [[TMP102]] +; CHECK-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[TMP117]], [[TMP113]] +; CHECK-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]]) ; CHECK-NEXT: ret i32 [[ADD113_3]] ; ; THR15-LABEL: define i32 @test( ; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; THR15-NEXT: entry: -; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 -; THR15-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 ; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 -; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32 ; THR15-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; THR15-NEXT: [[TMP2:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; THR15-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP2]] to i32 ; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 -; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; THR15-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; THR15-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr 
[[ADD_PTR64]], i64 [[IDX_EXT63]] ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; THR15-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 +; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 +; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 +; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; THR15-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; THR15-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; THR15-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; THR15-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; THR15-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; THR15-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; THR15-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]] +; THR15-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +; THR15-NEXT: [[TMP47:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] +; THR15-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP47]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP47]], [[TMP14]] +; THR15-NEXT: [[TMP49:%.*]] = sub <4 x i32> [[TMP47]], [[TMP14]] +; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP49]], <4 x i32> +; THR15-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]] +; THR15-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]] +; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; THR15-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 +; THR15-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; THR15-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32> +; THR15-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]] +; THR15-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; THR15-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]] +; THR15-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16) +; THR15-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]] +; THR15-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP33]], [[TMP34]] +; THR15-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP33]], [[TMP34]] +; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> +; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]] +; THR15-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]] +; THR15-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> ; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; THR15-NEXT: 
[[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; THR15-NEXT: [[TMP87:%.*]] = zext i8 [[TMP6]] to i32 +; THR15-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]] +; THR15-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; THR15-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]] ; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; THR15-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32> +; THR15-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32> ; THR15-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; THR15-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32> -; THR15-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]] -; THR15-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], splat (i32 16) -; THR15-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]] -; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32> -; THR15-NEXT: [[TMP86:%.*]] = zext i8 [[TMP7]] to i32 -; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32> -; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]] -; THR15-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32> -; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]] -; THR15-NEXT: [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], splat (i32 16) -; THR15-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]] -; THR15-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP76]], [[TMP59]] -; THR15-NEXT: [[TMP42:%.*]] = sub <2 x i32> [[TMP59]], [[TMP76]] -; THR15-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0 -; THR15-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1 -; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]] -; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0 -; THR15-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1 -; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]] -; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[TMP47:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1 -; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32> -; THR15-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 -; THR15-NEXT: [[TMP50:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32> -; THR15-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]] +; THR15-NEXT: 
[[TMP50:%.*]] = zext <4 x i8> [[TMP16]] to <4 x i32> +; THR15-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]] +; THR15-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16) +; THR15-NEXT: [[TMP62:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]] +; THR15-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP62]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP62]], [[TMP54]] +; THR15-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP62]], [[TMP54]] +; THR15-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> +; THR15-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] +; THR15-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] +; THR15-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> ; THR15-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; THR15-NEXT: [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP54]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; THR15-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> -; THR15-NEXT: [[TMP57:%.*]] = sub <2 x i32> [[TMP77]], [[TMP56]] -; THR15-NEXT: [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], splat (i32 16) -; THR15-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]] -; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; THR15-NEXT: [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 -; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> -; THR15-NEXT: [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; THR15-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> -; THR15-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]] -; THR15-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; THR15-NEXT: [[TMP67:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; THR15-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]] -; THR15-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16) -; THR15-NEXT: [[TMP73:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]] -; THR15-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 -; THR15-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 -; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] -; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0 -; THR15-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1 -; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] -; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP80]], [[TMP81]] -; THR15-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] -; THR15-NEXT: [[TMP78:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP78]], i32 [[ADD48_3]], i32 0 -; THR15-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP30]], i32 
[[ADD55_3]], i32 0 -; THR15-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP71]], [[TMP83]] -; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; THR15-NEXT: [[TMP137:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP137]], i32 [[SUB45_3]], i32 0 -; THR15-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[SUB47_3]], i32 0 -; THR15-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] -; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD44_2]] -; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD44_2]], [[ADD48_4]] -; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15 -; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 -; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP44]], 15 -; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 -; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; THR15-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_4]], [[ADD46_2]] -; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD46_2]], [[ADD55_4]] -; THR15-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15 -; THR15-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; THR15-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; THR15-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15 -; THR15-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; THR15-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 -; THR15-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0 -; THR15-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1 -; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP88]], [[TMP89]] -; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP89]], [[TMP88]] -; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; THR15-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP90]], [[TMP91]] -; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP91]], [[TMP90]] -; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15 -; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; THR15-NEXT: [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; THR15-NEXT: [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32> -; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; THR15-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> -; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; THR15-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> -; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]] -; THR15-NEXT: [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], splat (i32 16) -; THR15-NEXT: [[TMP102:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32> -; THR15-NEXT: [[TMP104:%.*]] = 
shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; THR15-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; THR15-NEXT: [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]] -; THR15-NEXT: [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], splat (i32 16) -; THR15-NEXT: [[TMP110:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP111:%.*]] = sub <2 x i32> [[TMP110]], [[TMP103]] -; THR15-NEXT: [[TMP112:%.*]] = add <2 x i32> [[TMP109]], [[TMP111]] -; THR15-NEXT: [[TMP113:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP114:%.*]] = sub <2 x i32> [[TMP113]], [[TMP95]] -; THR15-NEXT: [[TMP115:%.*]] = add <2 x i32> [[TMP101]], [[TMP114]] -; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP115]], <2 x i32> -; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP112]], [[TMP115]] -; THR15-NEXT: [[TMP118:%.*]] = sub <2 x i32> [[TMP115]], [[TMP112]] -; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0 -; THR15-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1 -; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP120]], [[TMP119]] -; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP119]], [[TMP120]] -; THR15-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0 -; THR15-NEXT: [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1 -; THR15-NEXT: [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]] -; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP121]], [[TMP122]] -; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15 -; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP122]], 15 -; THR15-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; THR15-NEXT: [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; THR15-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32> -; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; THR15-NEXT: [[TMP125:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32> -; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; THR15-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; THR15-NEXT: [[TMP129:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32> -; THR15-NEXT: [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]] -; THR15-NEXT: [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], splat (i32 16) -; THR15-NEXT: [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32> -; THR15-NEXT: [[TMP154:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32> -; THR15-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> -; THR15-NEXT: 
[[TMP135:%.*]] = sub <2 x i32> [[TMP155]], [[TMP134]] -; THR15-NEXT: [[TMP170:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16) -; THR15-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]] -; THR15-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP170]], [[TMP141]] -; THR15-NEXT: [[TMP186:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV_1]], i32 0 -; THR15-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP186]], [[TMP126]] -; THR15-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP132]], [[TMP187]] -; THR15-NEXT: [[TMP136:%.*]] = add <2 x i32> [[TMP171]], [[TMP142]] -; THR15-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP142]], [[TMP171]] -; THR15-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0 -; THR15-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1 -; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP145]], [[TMP144]] -; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]] -; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 -; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1 -; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]] -; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]] -; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP145]], 15 -; THR15-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 -; THR15-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP151]], 15 -; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 -; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; THR15-NEXT: [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], splat (i32 15) -; THR15-NEXT: [[TMP157:%.*]] = and <2 x i32> [[TMP156]], splat (i32 65537) -; THR15-NEXT: [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], splat (i32 65535) -; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_2]], [[ADD48]] -; THR15-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_2]] -; THR15-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] -; THR15-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; THR15-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] -; THR15-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] -; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] -; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]] -; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP44]] -; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP145]] -; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]] -; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] -; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; THR15-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD48_1]], [[ADD55]] -; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] -; THR15-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] -; THR15-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] -; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; THR15-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] -; THR15-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP86]] -; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] -; 
THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]] -; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP151]] -; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]] -; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] -; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] -; THR15-NEXT: [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 -; THR15-NEXT: [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 -; THR15-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP161]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP160]], [[TMP162]] -; THR15-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP160]], [[TMP162]] -; THR15-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> [[TMP164]], <2 x i32> -; THR15-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]] -; THR15-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]] -; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] -; THR15-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; THR15-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP158]], [[TMP165]] -; THR15-NEXT: [[TMP167:%.*]] = xor <2 x i32> [[TMP166]], [[TMP124]] -; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP119]], 15 -; THR15-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; THR15-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP119]] -; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; THR15-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP167]], i32 0 -; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP168]] -; THR15-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1 -; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]] -; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[SUB59]] -; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB51_1]] -; THR15-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 -; THR15-NEXT: [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 -; THR15-NEXT: [[TMP175:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP173]], [[TMP175]] -; THR15-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP173]], [[TMP175]] -; THR15-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP177]], <2 x i32> -; THR15-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]] -; THR15-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]] -; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]] -; THR15-NEXT: [[XOR_I53_3:%.*]] 
= xor i32 [[ADD_I52_3]], [[CONV]] -; THR15-NEXT: [[TMP179:%.*]] = lshr <2 x i32> [[TMP93]], splat (i32 15) -; THR15-NEXT: [[TMP180:%.*]] = and <2 x i32> [[TMP179]], splat (i32 65537) -; THR15-NEXT: [[TMP181:%.*]] = mul <2 x i32> [[TMP180]], splat (i32 65535) -; THR15-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP181]], [[TMP178]] -; THR15-NEXT: [[TMP183:%.*]] = xor <2 x i32> [[TMP182]], [[TMP93]] -; THR15-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 -; THR15-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 -; THR15-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 -; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] -; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; THR15-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP183]], i32 0 -; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]] -; THR15-NEXT: [[TMP185:%.*]] = extractelement <2 x i32> [[TMP183]], i32 1 -; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP185]] -; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] +; THR15-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32> +; THR15-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> +; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] +; THR15-NEXT: [[TMP155:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 +; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; THR15-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP53]], i64 2) +; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; THR15-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; THR15-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> +; THR15-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] +; THR15-NEXT: [[TMP165:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) +; THR15-NEXT: [[TMP166:%.*]] = add <4 x i32> [[TMP165]], [[TMP155]] +; THR15-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP166]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP166]], [[TMP79]] +; THR15-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP166]], [[TMP79]] +; THR15-NEXT: [[TMP222:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> +; THR15-NEXT: [[TMP217:%.*]] = shufflevector <4 x i32> [[TMP222]], <4 x i32> poison, <4 x i32> +; THR15-NEXT: [[TMP223:%.*]] = add <4 x i32> [[TMP222]], [[TMP217]] +; THR15-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP222]], [[TMP217]] +; THR15-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP223]], <4 x i32> [[TMP85]], <4 x i32> +; THR15-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]] +; THR15-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]] +; THR15-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> +; THR15-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP91]], i64 4) +; THR15-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] +; THR15-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] +; THR15-NEXT: [[TMP93:%.*]] = 
shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> +; THR15-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP87]], i64 4) +; THR15-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP90]], [[TMP94]] +; THR15-NEXT: [[TMP98:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP91]], i64 0) +; THR15-NEXT: [[TMP97:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP98]], <4 x i32> [[TMP88]], i64 4) +; THR15-NEXT: [[TMP101:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP87]], i64 0) +; THR15-NEXT: [[TMP99:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP101]], <4 x i32> [[TMP92]], i64 4) +; THR15-NEXT: [[TMP100:%.*]] = sub <8 x i32> [[TMP97]], [[TMP99]] +; THR15-NEXT: [[TMP102:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP100]], <16 x i32> +; THR15-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> +; THR15-NEXT: [[TMP104:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP105:%.*]] = shufflevector <16 x i32> [[TMP103]], <16 x i32> [[TMP104]], <16 x i32> +; THR15-NEXT: [[TMP106:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP107:%.*]] = shufflevector <16 x i32> [[TMP105]], <16 x i32> [[TMP106]], <16 x i32> +; THR15-NEXT: [[TMP108:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP109:%.*]] = shufflevector <16 x i32> [[TMP107]], <16 x i32> [[TMP108]], <16 x i32> +; THR15-NEXT: [[TMP110:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP111:%.*]] = shufflevector <16 x i32> [[TMP109]], <16 x i32> [[TMP110]], <16 x i32> +; THR15-NEXT: [[TMP112:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP113:%.*]] = shufflevector <16 x i32> [[TMP111]], <16 x i32> [[TMP112]], <16 x i32> +; THR15-NEXT: [[TMP114:%.*]] = lshr <16 x i32> [[TMP113]], splat (i32 15) +; THR15-NEXT: [[TMP115:%.*]] = and <16 x i32> [[TMP114]], splat (i32 65537) +; THR15-NEXT: [[TMP116:%.*]] = mul <16 x i32> [[TMP115]], splat (i32 65535) +; THR15-NEXT: [[TMP117:%.*]] = add <16 x i32> [[TMP116]], [[TMP102]] +; THR15-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[TMP117]], [[TMP113]] +; THR15-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]]) ; THR15-NEXT: ret i32 [[ADD113_3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 85131758853b3..3243579f11820 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1027,10 +1027,8 @@ define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x 
i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll index e24c52ba81ddf..b374e877bb38a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -1,16 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; CHECK-LABEL: @sitofp_uitofp( -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -42,9 +44,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; CHECK-LABEL: @fptosi_fptoui( -; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; 
CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -75,11 +79,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { } define <8 x float> @fneg_fabs(<8 x float> %a) { -; CHECK-LABEL: @fneg_fabs( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]] +; SSE2-LABEL: @fneg_fabs( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; SLM-LABEL: @fneg_fabs( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX-LABEL: @fneg_fabs( +; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX2-LABEL: @fneg_fabs( +; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX512-LABEL: @fneg_fabs( +; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -126,9 +158,11 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { define <8 x i32> @sext_zext(<8 x i16> %a) { ; CHECK-LABEL: @sext_zext( -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = 
shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll index 0f8751a6da7f5..ddd3dffaafcc5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -1,16 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; CHECK-LABEL: @sitofp_uitofp( -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -42,9 +44,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; CHECK-LABEL: @fptosi_fptoui( -; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 
x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -75,11 +79,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { } define <8 x float> @fneg_fabs(<8 x float> %a) { -; CHECK-LABEL: @fneg_fabs( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]] +; SSE2-LABEL: @fneg_fabs( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; SLM-LABEL: @fneg_fabs( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]] +; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX-LABEL: @fneg_fabs( +; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX2-LABEL: @fneg_fabs( +; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] +; +; AVX512-LABEL: @fneg_fabs( +; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -126,9 +158,11 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { define <8 x i32> @sext_zext(<8 x i16> %a) { ; CHECK-LABEL: @sext_zext( -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; 
CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll index 5a1de4f3e3d7f..4ba4ba13a5ffe 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -1,17 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fadd_fsub_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fadd_fsub_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fadd_fsub_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], 
[[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fadd_fsub_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,11 +73,35 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll index 046ed781f4c8d..f29e546631b71 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -1,17 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | 
FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fadd_fsub_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fadd_fsub_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fadd_fsub_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fadd_fsub_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fadd_fsub_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = 
shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,11 +73,35 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP5]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP5]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP5]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll index 8839fc2281788..f8c5df9944538 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -7,11 +7,39 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @add_sub_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @add_sub_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x 
i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @add_sub_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX1-LABEL: @add_sub_v8i32( +; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @add_sub_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @add_sub_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP3]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] @@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x 
i32> poison, <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[R71]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], @@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { } define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { -; CHECK-LABEL: @add_sub_v8i32_splat( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP5]] +; SSE-LABEL: @add_sub_v8i32_splat( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @add_sub_v8i32_splat( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX1-LABEL: @add_sub_v8i32_splat( +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX2-LABEL: @add_sub_v8i32_splat( +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX512-LABEL: @add_sub_v8i32_splat( +; AVX512-NEXT: [[TMP1:%.*]] = 
insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP5]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll index dfa918a6ea453..b84ef027f67c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -7,11 +7,39 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @add_sub_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @add_sub_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] +; +; SLM-LABEL: @add_sub_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX1-LABEL: @add_sub_v8i32( +; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @add_sub_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @add_sub_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> 
[[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP3]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] @@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[R71]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], @@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { } define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { -; CHECK-LABEL: @add_sub_v8i32_splat( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP5]] +; SSE-LABEL: @add_sub_v8i32_splat( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @add_sub_v8i32_splat( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 
x i32> poison, i32 [[B:%.*]], i64 0 +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX1-LABEL: @add_sub_v8i32_splat( +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX2-LABEL: @add_sub_v8i32_splat( +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP5]] +; +; AVX512-LABEL: @add_sub_v8i32_splat( +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP5]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index b659c10bb2fbf..7ed5f33c9dc6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -7,7 +7,7 @@ define void @test() { ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]] @@ -16,6 +16,8 @@ define void @test() { ; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll index aff66dd7c10ea..70c67ff251d6d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll @@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP7]], <6 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP14]], <6 x i32> ; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 ; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll index cfbfd0ebc37bc..7b26da936cb1b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -10,22 +10,25 @@ define i32 @bar() local_unnamed_addr { ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[SUB102_1]], i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> , <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[SUB102_3]], i32 12 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[SUB102_1]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[ADD94_1]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[SUB86_1]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[ADD78_2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[SUB102_3]], i32 6 +; 
CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8) ; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15) ; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537) ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535) -; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP19]], <8 x i32> [[TMP10]], i64 8) +; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP20]] ; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]]) ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll index 2f49a2e6a212e..e9a65bf6d6f0d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll @@ -6,11 +6,11 @@ define i1 @foo() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0 ; CHECK-NEXT: [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0 -; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_1_2_I_I:%.*]] = icmp ne i32 [[STOREMERGE_2333_I_I]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL_NOT_NOT509_I_1_2_I_I]], i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> [[TMP0]], i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP1]], <4 x i1> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> [[TMP2]], <2 x i1> zeroinitializer, i64 6) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[STOREMERGE_2333_I_I]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> , <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> [[TMP6]], i64 4) ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index c01c44ff03c15..1294a87ff6967 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -7,20 +7,14 @@ define void @test(i1 %c, ptr %arg) { ; CHECK: if: ; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll index 33fa00c1881da..8ce3e62519b6d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll @@ -6,23 +6,19 @@ define i32 @a() { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = 
shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <8 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]] -; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4 -; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr null, align 4 ; CHECK-NEXT: br label %[[BB1]] ; br label %1 diff --git a/llvm/test/Transforms/SLPVectorizer/addsub.ll b/llvm/test/Transforms/SLPVectorizer/addsub.ll index 3961250d56451..6814bc0f566f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/addsub.ll @@ -387,14 +387,10 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re define void @vec_shuff_reorder() #0 { ; CHECK-LABEL: @vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll index 056b6222cae72..a900b7a3afbc5 100644 --- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -4,24 +4,27 @@ define void @func(i32 %0) { ; CHECK-LABEL: define void @func( ; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> 
[[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 0, 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 0 to i64 ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <32 x i32> , i32 [[TMP11]], i32 30 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14) -; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP16]], <2 x i32> zeroinitializer, i64 28) -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i32> [[TMP8]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <30 x i64> poison, i64 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <30 x i64> [[TMP77]], i64 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <30 x i64> [[TMP80]], <30 x i64> poison, <30 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <30 x i64> [[TMP11]], <30 x i64> poison, <32 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i64> @llvm.vector.insert.v32i64.v2i64(<32 x i64> [[TMP12]], <2 x i64> zeroinitializer, i64 30) +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <32 x i64> , i64 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <32 x i64> [[TMP14]], i64 [[TMP9]], i32 30 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <32 x i64> [[TMP15]], <32 x i64> poison, <32 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i64> @llvm.vector.insert.v32i64.v8i64(<32 x i64> [[TMP16]], <8 x i64> zeroinitializer, i64 16) +; CHECK-NEXT: [[TMP18:%.*]] = call <32 x i64> @llvm.vector.insert.v32i64.v4i64(<32 x i64> [[TMP17]], <4 x i64> zeroinitializer, i64 24) +; CHECK-NEXT: [[TMP81:%.*]] = call <32 x i64> @llvm.vector.insert.v32i64.v2i64(<32 x i64> [[TMP18]], <2 x i64> zeroinitializer, i64 14) +; CHECK-NEXT: [[TMP82:%.*]] = call <32 x i64> @llvm.vector.insert.v32i64.v2i64(<32 x i64> [[TMP81]], <2 x i64> zeroinitializer, i64 28) +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i64> [[TMP13]], [[TMP82]] +; CHECK-NEXT: [[TMP78:%.*]] = or i64 [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP20]], i32 31 ; CHECK-NEXT: [[TMP22:%.*]] = and i1 false, [[TMP21]] @@ -79,8 +82,6 @@ define void @func(i32 %0) { ; CHECK-NEXT: [[TMP74:%.*]] = and i1 false, [[TMP73]] ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <32 x i1> [[TMP20]], i32 4 ; CHECK-NEXT: [[TMP76:%.*]] = and i1 false, [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i32> [[TMP18]], i32 0 -; CHECK-NEXT: [[TMP78:%.*]] = zext i32 [[TMP77]] to i64 ; CHECK-NEXT: 
[[TMP79:%.*]] = getelementptr float, ptr addrspace(1) null, i64 [[TMP78]] ; CHECK-NEXT: ret void ; From ba03acd7f6b039b580a70c36b8cbf37d717d4eac Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 17 Jan 2025 16:15:44 +0000 Subject: [PATCH 2/2] Fix formatting Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 59063d6b4c9bc..cf854213a1b24 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14171,7 +14171,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { } assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || - E->State == TreeEntry::SplitVectorize || + E->State == TreeEntry::SplitVectorize || (isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I)) || (GatheredLoadsEntriesFirst.has_value() &&