From 9e240ac5954cfb7700cf0aafd4b0600dcac9f590 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 6 Nov 2024 19:52:20 +0000 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 173 ++++++- .../AArch64/reused-scalar-repeated-in-node.ll | 8 +- .../SLPVectorizer/RISCV/complex-loads.ll | 462 +++++++++--------- .../X86/scatter-vectorize-reorder.ll | 2 +- .../alternate-cmp-swapped-pred-parent.ll | 6 +- .../extract-many-users-buildvector.ll | 75 ++- ...hered-consecutive-loads-different-types.ll | 10 +- .../SLPVectorizer/reorder-clustered-node.ll | 72 ++- .../resized-alt-shuffle-after-minbw.ll | 6 +- 9 files changed, 509 insertions(+), 305 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 53e514766fee8..48419699f9cd5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4846,8 +4846,21 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0, VectorType *SubTp = nullptr, ArrayRef Args = {}) { - if (Kind != TTI::SK_PermuteTwoSrc) + if (Kind != TTI::SK_PermuteTwoSrc) { + int SplatIdx = PoisonMaskElem; + if (!Mask.empty() && all_of(Mask, [&](int Idx) { + if (Idx == PoisonMaskElem) + return true; + if (SplatIdx == PoisonMaskElem) { + SplatIdx = Idx; + return true; + } + return SplatIdx == Idx; + })) + return TTI.getShuffleCost(TTI::SK_Broadcast, Tp, Mask, CostKind, Index, + SubTp, Args); return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); + } int NumSrcElts = Tp->getElementCount().getKnownMinValue(); int NumSubElts; if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( @@ -10257,10 +10270,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Idx = EMask[Idx]; } CommonVF = E->Scalars.size(); - } else if (std::optional Factor = E->getInterleaveFactor(); - Factor && E->Scalars.size() != Mask.size() && + } else if (unsigned Factor = E->getInterleaveFactor(); + Factor > 0 && E->Scalars.size() != Mask.size() && ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask, - *Factor)) { + Factor)) { // Deinterleaved nodes are free. std::iota(CommonMask.begin(), CommonMask.end(), 0); } @@ -12935,6 +12948,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // No perfect match, just shuffle, so choose the first tree node from the // tree. Entries.push_back(FirstEntries.front()); + VF = FirstEntries.front()->getVectorFactor(); } else { // Try to find nodes with the same vector factor. assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); @@ -12975,6 +12989,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.push_back(SecondEntries.front()); VF = std::max(Entries.front()->getVectorFactor(), Entries.back()->getVectorFactor()); + } else { + VF = Entries.front()->getVectorFactor(); } } @@ -13077,26 +13093,149 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair &Pair : EntryLanes) { - unsigned Idx = Part * VL.size() + Pair.second; + int Idx = Part * VL.size() + Pair.second; Mask[Idx] = Pair.first * VF + (ForOrder ? std::distance( Entries[Pair.first]->Scalars.begin(), find(Entries[Pair.first]->Scalars, VL[Pair.second])) : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); - IsIdentity &= Mask[Idx] == Pair.second; + IsIdentity &= Mask[Idx] % VL.size() == Idx % VL.size(); } - switch (Entries.size()) { - case 1: - if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteSingleSrc; - break; - case 2: - if (EntryLanes.size() > 2 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteTwoSrc; - break; - default: - break; + if (ForOrder || IsIdentity || Entries.empty()) { + switch (Entries.size()) { + case 1: + if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteSingleSrc; + break; + case 2: + if (EntryLanes.size() > 2 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteTwoSrc; + break; + default: + break; + } + } else if (!isa(VL.front()->getType()) && + (EntryLanes.size() > Entries.size() || VL.size() <= 2)) { + // Do the cost estimation if shuffle beneficial than buildvector. + SmallVector SubMask(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size())); + int MinElement = SubMask.front(), MaxElement = SubMask.front(); + for (int Idx : SubMask) { + if (Idx == PoisonMaskElem) + continue; + if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF) + MinElement = Idx; + if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF) + MaxElement = Idx; + } + assert(MaxElement >= 0 && MinElement >= 0 && + "Expected at least single element."); + unsigned NewVF = std::max( + VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(), + (MaxElement % VF) - + (MinElement % VF) + 1)); + if (NewVF < VF) { + for_each(SubMask, [&](int &Idx) { + if (Idx == PoisonMaskElem) + return; + Idx = (Idx % VF) - (MinElement % VF) + + (Idx >= static_cast(VF) ? NewVF : 0); + }); + VF = NewVF; + } + + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto *VecTy = getWidenedType(VL.front()->getType(), VF); + auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); + auto GetShuffleCost = [&, + &TTI = *TTI](ArrayRef Mask, + ArrayRef Entries, + VectorType *VecTy) -> InstructionCost { + if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Mask, Entries.front()->getInterleaveFactor())) + return TTI::TCC_Free; + return ::getShuffleCost(TTI, + Entries.size() > 1 ? TTI::SK_PermuteTwoSrc + : TTI::SK_PermuteSingleSrc, + VecTy, Mask, CostKind); + }; + InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy); + InstructionCost FirstShuffleCost = 0; + SmallVector FirstMask(SubMask.begin(), SubMask.end()); + if (Entries.size() == 1 || !Entries[0]->isGather()) { + FirstShuffleCost = ShuffleCost; + } else { + // Transform mask to include only first entry. + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + bool IsIdentity = true; + for (auto [I, Idx] : enumerate(FirstMask)) { + if (Idx >= static_cast(VF)) { + Idx = PoisonMaskElem; + } else { + DemandedElts.clearBit(I); + if (Idx != PoisonMaskElem) + IsIdentity &= static_cast(I) == Idx; + } + } + if (!IsIdentity) + FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy); + FirstShuffleCost += TTI->getScalarizationOverhead( + MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + } + InstructionCost SecondShuffleCost = 0; + SmallVector SecondMask(SubMask.begin(), SubMask.end()); + if (Entries.size() == 1 || !Entries[1]->isGather()) { + SecondShuffleCost = ShuffleCost; + } else { + // Transform mask to include only first entry. + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + bool IsIdentity = true; + for (auto [I, Idx] : enumerate(SecondMask)) { + if (Idx < static_cast(VF) && Idx >= 0) { + Idx = PoisonMaskElem; + } else { + DemandedElts.clearBit(I); + if (Idx != PoisonMaskElem) { + Idx -= VF; + IsIdentity &= static_cast(I) == Idx; + } + } + } + if (!IsIdentity) + SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy); + SecondShuffleCost += TTI->getScalarizationOverhead( + MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + } + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx == PoisonMaskElem) + DemandedElts.clearBit(I); + InstructionCost BuildVectorCost = + TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + const TreeEntry *BestEntry = nullptr; + if (FirstShuffleCost < ShuffleCost) { + copy(FirstMask, std::next(Mask.begin(), Part * VL.size())); + BestEntry = Entries.front(); + ShuffleCost = FirstShuffleCost; + } + if (SecondShuffleCost < ShuffleCost) { + copy(SecondMask, std::next(Mask.begin(), Part * VL.size())); + BestEntry = Entries[1]; + ShuffleCost = SecondShuffleCost; + } + if (BuildVectorCost >= ShuffleCost) { + if (BestEntry) { + Entries.clear(); + Entries.push_back(BestEntry); + } + return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc + : TargetTransformInfo::SK_PermuteSingleSrc; + } } Entries.clear(); // Clear the corresponding mask elements. diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index d6073ea4bbbae..96ff73f117a73 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -46,12 +46,12 @@ define void @test() { ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x float> [ poison, %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP17]], [[TMP13]] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP14]], <2 x float> [[TMP0]], i64 2) ; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 912d60d0cc386..f57e5de07807e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -28,13 +28,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> @@ -50,7 +46,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32 ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] @@ -61,246 +56,203 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16) ; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]] -; CHECK-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]] ; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]] ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr i8, ptr null, i64 3 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr i8, ptr null, i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1 ; CHECK-NEXT: [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP60:%.*]] = load <4 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <4 x i8> [[TMP60]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP52]], i32 1 +; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32> ; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX22_3]], i32 0 +; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) ; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> ; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]] ; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i8> [[TMP48]], <2 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <2 x i8> [[TMP83]], i8 [[TMP33]], i32 0 ; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]] -; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], splat (i32 16) -; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]] -; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; CHECK-NEXT: [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[TMP81:%.*]] = sub <2 x i32> [[TMP58]], [[TMP76]] +; CHECK-NEXT: [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16) +; CHECK-NEXT: [[TMP168:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]] +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <4 x i8> [[TMP60]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i8> [[TMP53]], i8 [[TMP34]], i32 0 ; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32> ; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]] -; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]] -; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16) -; CHECK-NEXT: [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0 -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1 -; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]] -; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1 -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]] -; CHECK-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] -; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0 -; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]] -; CHECK-NEXT: [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]] -; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0 -; CHECK-NEXT: [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0 -; CHECK-NEXT: [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]] -; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]] -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15 +; CHECK-NEXT: [[TMP169:%.*]] = shufflevector <2 x i8> [[TMP53]], <2 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i8> [[TMP169]], i8 [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32> +; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i8> [[TMP48]], i8 [[TMP47]], i32 0 +; CHECK-NEXT: [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]] +; CHECK-NEXT: [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16) +; CHECK-NEXT: [[TMP68:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]] +; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP168]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP68]], <2 x i32> [[TMP42]], <2 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP30]], <2 x i32> +; CHECK-NEXT: [[TMP152:%.*]] = add <2 x i32> [[TMP70]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <2 x i32> [[TMP68]], <2 x i32> [[TMP42]], <2 x i32> +; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP30]], <2 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP73]], [[TMP74]] +; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1 +; CHECK-NEXT: [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 +; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]] +; CHECK-NEXT: [[TMP220:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0 +; CHECK-NEXT: [[TMP221:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 +; CHECK-NEXT: [[SUB59_1:%.*]] = add i32 [[TMP221]], [[TMP220]] +; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP152]], [[TMP75]] +; CHECK-NEXT: [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0 +; CHECK-NEXT: [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1 +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] +; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP69]], <2 x i32> [[TMP44]], <2 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP69]], <2 x i32> [[TMP44]], <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP223]], [[TMP84]] +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]] +; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]] +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 +; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP197]], 15 +; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 +; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0 +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1 +; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 +; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]] +; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15 ; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15 -; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[ADD55_4]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]] -; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15 -; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 -; CHECK-NEXT: [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535 -; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15 -; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535 -; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0 -; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]] -; CHECK-NEXT: [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]] -; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 -; CHECK-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]] -; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15 -; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 -; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 -; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> -; CHECK-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; CHECK-NEXT: [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; CHECK-NEXT: [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]] -; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], splat (i32 16) -; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> -; CHECK-NEXT: [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32> -; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]] -; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], splat (i32 16) -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]] -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]] -; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> -; CHECK-NEXT: [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] -; CHECK-NEXT: [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] -; CHECK-NEXT: [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0 -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]] -; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0 -; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1 -; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]] -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]] -; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15 -; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15 -; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537 -; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 -; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> -; CHECK-NEXT: [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]] -; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], splat (i32 16) -; CHECK-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; CHECK-NEXT: [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32> -; CHECK-NEXT: [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]] -; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16) -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]] -; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]] -; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]] -; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]] -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]] -; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0 -; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]] -; CHECK-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]] -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0 -; CHECK-NEXT: [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]] -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15 +; CHECK-NEXT: [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32> +; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32> +; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]] +; CHECK-NEXT: [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16) +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> +; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]] +; CHECK-NEXT: [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16) +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 +; CHECK-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]] +; CHECK-NEXT: [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]] +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 +; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]] +; CHECK-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> +; CHECK-NEXT: [[TMP118:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]] +; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]] +; CHECK-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0 +; CHECK-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1 +; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP121]], [[TMP120]] +; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> [[TMP44]], <2 x i32> +; CHECK-NEXT: [[TMP123:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> [[TMP44]], <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP122]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = add <2 x i32> [[TMP122]], [[TMP123]] +; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP124]], <2 x i32> [[TMP125]], <2 x i32> +; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1 +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD113_2]], [[TMP127]] +; CHECK-NEXT: [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0 +; CHECK-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1 +; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]] +; CHECK-NEXT: [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP121]], 15 +; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 +; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 +; CHECK-NEXT: [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> +; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32> +; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32> +; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]] +; CHECK-NEXT: [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16) +; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32> +; CHECK-NEXT: [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32> +; CHECK-NEXT: [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32> +; CHECK-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]] +; CHECK-NEXT: [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16) +; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1 +; CHECK-NEXT: [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]] +; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]] +; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]] +; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]] +; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]] +; CHECK-NEXT: [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]] +; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0 +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1 +; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]] +; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0 +; CHECK-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1 +; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]] +; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15 ; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 ; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15 +; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15 ; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; CHECK-NEXT: [[TMP154:%.*]] = lshr <2 x i32> [[TMP110]], splat (i32 15) -; CHECK-NEXT: [[TMP184:%.*]] = and <2 x i32> [[TMP154]], splat (i32 65537) -; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP184]], splat (i32 65535) -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] +; CHECK-NEXT: [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15) +; CHECK-NEXT: [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537) +; CHECK-NEXT: [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535) +; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] ; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] -; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]] +; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]] +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] -; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]] +; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP197]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]] +; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]] +; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] +; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP121]] ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]] +; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]] ; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]] -; CHECK-NEXT: [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]] -; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]] -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]] -; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]] -; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]] -; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]] -; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]] -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] -; CHECK-NEXT: [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] -; CHECK-NEXT: [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 -; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 -; CHECK-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]] -; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]] -; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> -; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]] -; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]] -; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] -; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]] -; CHECK-NEXT: [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15 -; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]] -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; CHECK-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]] -; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1 -; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]] -; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]] +; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]] +; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0 +; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]] ; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 ; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 @@ -308,21 +260,83 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]] ; CHECK-NEXT: [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]] ; CHECK-NEXT: [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> -; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]] -; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]] +; CHECK-NEXT: [[TMP174:%.*]] = insertelement <2 x i32> [[TMP119]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP175:%.*]] = lshr <2 x i32> [[TMP174]], splat (i32 15) +; CHECK-NEXT: [[TMP226:%.*]] = and <2 x i32> [[TMP175]], splat (i32 65537) +; CHECK-NEXT: [[TMP227:%.*]] = mul <2 x i32> [[TMP226]], splat (i32 65535) +; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP229:%.*]] = insertelement <2 x i32> [[TMP228]], i32 [[ADD55]], i32 1 +; CHECK-NEXT: [[TMP230:%.*]] = insertelement <2 x i32> [[TMP222]], i32 [[ADD55_1]], i32 1 +; CHECK-NEXT: [[TMP231:%.*]] = sub <2 x i32> [[TMP229]], [[TMP230]] +; CHECK-NEXT: [[TMP232:%.*]] = insertelement <2 x i32> poison, i32 [[SUB51_1]], i32 0 +; CHECK-NEXT: [[TMP233:%.*]] = insertelement <2 x i32> [[TMP232]], i32 [[ADD113_2]], i32 1 +; CHECK-NEXT: [[TMP184:%.*]] = sub <2 x i32> [[TMP126]], [[TMP233]] +; CHECK-NEXT: [[TMP234:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP235:%.*]] = zext <2 x i8> [[TMP234]] to <2 x i32> +; CHECK-NEXT: [[TMP236:%.*]] = lshr <2 x i32> [[TMP235]], splat (i32 15) +; CHECK-NEXT: [[TMP237:%.*]] = and <2 x i32> [[TMP236]], splat (i32 65537) +; CHECK-NEXT: [[TMP238:%.*]] = mul <2 x i32> [[TMP237]], splat (i32 65535) +; CHECK-NEXT: [[TMP239:%.*]] = shufflevector <2 x i32> [[TMP184]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP191:%.*]] = insertelement <2 x i32> [[TMP239]], i32 [[ADD94_1]], i32 1 +; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP231]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP240:%.*]] = insertelement <2 x i32> [[TMP192]], i32 [[ADD78_1]], i32 1 +; CHECK-NEXT: [[TMP194:%.*]] = add <2 x i32> [[TMP191]], [[TMP240]] +; CHECK-NEXT: [[TMP195:%.*]] = add <2 x i32> [[TMP238]], [[TMP194]] +; CHECK-NEXT: [[TMP196:%.*]] = xor <2 x i32> [[TMP195]], [[TMP235]] +; CHECK-NEXT: [[MUL_I51_4:%.*]] = extractelement <2 x i32> [[TMP196]], i32 0 ; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]] -; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]] +; CHECK-NEXT: [[TMP198:%.*]] = extractelement <2 x i32> [[TMP196]], i32 1 +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD_I52_3]], [[TMP198]] +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] +; CHECK-NEXT: [[TMP199:%.*]] = add <2 x i32> [[TMP231]], [[TMP184]] +; CHECK-NEXT: [[TMP200:%.*]] = sub <2 x i32> [[TMP231]], [[TMP184]] +; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> +; CHECK-NEXT: [[TMP202:%.*]] = add <2 x i32> [[TMP227]], [[TMP201]] +; CHECK-NEXT: [[TMP203:%.*]] = xor <2 x i32> [[TMP202]], [[TMP174]] +; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP203]], i32 1 +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]] +; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP231]], i32 0 +; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0 +; CHECK-NEXT: [[TMP207:%.*]] = shufflevector <2 x i32> [[TMP184]], <2 x i32> [[TMP231]], <2 x i32> +; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP206]], [[TMP205]] +; CHECK-NEXT: [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP183]] +; CHECK-NEXT: [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]] +; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15 +; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 +; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 +; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]] +; CHECK-NEXT: [[TMP210:%.*]] = extractelement <2 x i32> [[TMP203]], i32 0 +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP210]], [[ADD113_1]] +; CHECK-NEXT: [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]] +; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1 +; CHECK-NEXT: [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]] +; CHECK-NEXT: [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]] +; CHECK-NEXT: [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]] +; CHECK-NEXT: [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]] +; CHECK-NEXT: [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0 +; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 +; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]] +; CHECK-NEXT: [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]] +; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> +; CHECK-NEXT: [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]] +; CHECK-NEXT: [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]] +; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]] ; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15) ; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537) ; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535) -; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]] +; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]] ; CHECK-NEXT: [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]] ; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 ; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 ; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 ; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]] ; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0 ; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]] ; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index 360b258f216c5..f875d45db61dd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -14,7 +14,7 @@ define void @test() { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll index 371b23019498d..afca39ad8938a 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll @@ -12,7 +12,8 @@ define void @test() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 5 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 6 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL]], i32 7 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret void ; @@ -43,7 +44,8 @@ define void @test1() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL37]], i32 4 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> , i16 [[CALL]], i32 6 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL37]], i32 7 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll index 261ec2b3935d7..40568f9c8a509 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll @@ -1,31 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix X86 %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix AARCH64 %} define i1 @test(float %0, double %1) { -; CHECK-LABEL: define i1 @test -; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) -; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> -; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) -; CHECK-NEXT: ret i1 [[TMP22]] +; X86-LABEL: define i1 @test +; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { +; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 +; X86-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> +; X86-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; X86-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; X86-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; X86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; X86-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] +; X86-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; X86-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) +; X86-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; X86-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) +; X86-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] +; X86-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] +; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> +; X86-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> +; X86-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer +; X86-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer +; X86-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] +; X86-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) +; X86-NEXT: ret i1 [[TMP22]] +; +; AARCH64-LABEL: define i1 @test +; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { +; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 +; AARCH64-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> +; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; AARCH64-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP4]], <4 x i32> +; AARCH64-NEXT: [[TMP11:%.*]] = fmul <4 x double> [[TMP8]], [[TMP10]] +; AARCH64-NEXT: [[TMP12:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; AARCH64-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; AARCH64-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP12]], i64 0) +; AARCH64-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP14]], <2 x double> [[TMP6]], i64 4) +; AARCH64-NEXT: [[TMP16:%.*]] = fsub <8 x double> [[TMP13]], [[TMP15]] +; AARCH64-NEXT: [[TMP17:%.*]] = fmul <8 x double> [[TMP13]], [[TMP15]] +; AARCH64-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> +; AARCH64-NEXT: [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float> +; AARCH64-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer +; AARCH64-NEXT: [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer +; AARCH64-NEXT: [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]] +; AARCH64-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]]) +; AARCH64-NEXT: ret i1 [[TMP23]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll index a854c61db6d28..a42c8f2c650ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll +++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll @@ -11,8 +11,8 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = load volatile i8, ptr null, align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> , <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> , <8 x i8> [[TMP6]], <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i48> , i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0 @@ -21,9 +21,9 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(21) null, align 2 ; CHECK-NEXT: [[TMP13:%.*]] = load volatile i8, ptr null, align 2 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> , <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> , i8 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> , i8 [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP13]], i32 1 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP17]], [[TMP19]] diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll index 561182d5e4f49..940ee5b95871d 100644 --- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll @@ -1,30 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s --check-prefix X86 %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s --check-prefix AARCH64 %} define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) { -; CHECK-LABEL: @test( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7 -; CHECK-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]] -; CHECK-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]] -; CHECK-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]] -; CHECK-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; X86-LABEL: @test( +; X86-NEXT: bb: +; X86-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7 +; X86-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]] +; X86-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]] +; X86-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]] +; X86-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]] +; X86-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8 +; X86-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> +; X86-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 +; X86-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 +; X86-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] +; X86-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0 +; X86-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1 +; X86-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2 +; X86-NEXT: [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3 +; X86-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> +; X86-NEXT: [[TMP10:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; X86-NEXT: [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]] +; X86-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]] +; X86-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]]) +; X86-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false +; X86-NEXT: ret i1 [[OP_RDX]] +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: bb: +; AARCH64-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7 +; AARCH64-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]] +; AARCH64-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]] +; AARCH64-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]] +; AARCH64-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]] +; AARCH64-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8 +; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> +; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 +; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 +; AARCH64-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] +; AARCH64-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> +; AARCH64-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 +; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; AARCH64-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] +; AARCH64-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] +; AARCH64-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) +; AARCH64-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false +; AARCH64-NEXT: ret i1 [[OP_RDX]] ; bb: %i226 = getelementptr ptr, ptr %arg, i32 7 diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll index 61a84a67c9ff1..056b6222cae72 100644 --- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -13,9 +13,9 @@ define void @func(i32 %0) { ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> , <32 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <32 x i32> , i32 [[TMP11]], i32 30 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) ; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24) ; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14)