From 36043047cd71aa5ae0b85c762e3836ac6af95296 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Tue, 5 Sep 2023 06:46:21 -0700
Subject: [PATCH 1/2] [SLP]Do not early exit if the number of unique elements
 is non-power-of-2.

We can still try to vectorize the bundle of instructions, even if the
number of repeated instructions is non-power-of-2. In this case we need
to adjust the cost (calculate the cost only for the unique scalar
instructions) and the cost of the extracts. Also, when scheduling the
bundle we need to schedule only the unique scalars to avoid a compiler
crash caused by multiple dependencies. This can be applied safely only
if all the scalars' users are also vectorized and do not require memory
accesses (this is a temporary requirement that can be relaxed later).
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 81 +++++++++++++------
 .../AArch64/transpose-inseltpoison.ll         | 29 +++----
 .../SLPVectorizer/AArch64/transpose.ll        | 29 +++----
 .../vectorizable-selects-uniform-cmps.ll      | 51 +++++-------
 llvm/test/Transforms/SLPVectorizer/X86/cse.ll | 13 +--
 .../Transforms/SLPVectorizer/X86/pr49081.ll   | 12 +--
 6 files changed, 110 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d2570fdea4216..09a12ae44ac94 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2911,7 +2911,8 @@ class BoUpSLP {
     }
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        [[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
+        assert((!TE || TE == Last) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -2924,7 +2925,8 @@ class BoUpSLP {
       for (Value *V : VL) {
         if (doesNotNeedToBeScheduled(V))
           continue;
-        assert(BundleMember && "Unexpected end of bundle.");
+        if (!BundleMember)
+          continue;
         BundleMember->TE = Last;
         BundleMember = BundleMember->NextInBundle;
       }
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   SmallVector<int> ReuseShuffleIndicies;
   SmallVector<Value *> UniqueValues;
-  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
-                                &UserTreeIdx,
-                                this](const InstructionsState &S) {
+  SmallVector<Value *> NonUniqueValueVL;
+  auto TryToFindDuplicates = [&](const InstructionsState &S,
+                                 bool DoNotFail = false) {
     // Check that every instruction appears once in this bundle.
     DenseMap<Value *, unsigned> UniquePositions(VL.size());
     for (Value *V : VL) {
@@ -5612,6 +5614,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                                           !isConstant(V);
                                                  })) ||
           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+        SmallVector<Value *> IgnoredVals;
+        if (UserIgnoreList)
+          IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+        if (DoNotFail && UniquePositions.size() > 1 &&
+            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+            all_of(UniqueValues, [=](Value *V) {
+              return isa<ExtractElementInst>(V) ||
+                     areAllUsersVectorized(cast<Instruction>(V), IgnoredVals);
+            })) {
+          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          if (PWSz == VL.size()) {
+            ReuseShuffleIndicies.clear();
+          } else {
+            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+                                    UniqueValues.back());
+            VL = NonUniqueValueVL;
+          }
+          return true;
+        }
         LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
@@ -5857,7 +5879,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Check that every instruction appears once in this bundle.
-  if (!TryToFindDuplicates(S))
+  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
     return;
 
   // Perform specific checks for each particular instruction kind.
@@ -5877,7 +5899,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   BlockScheduling &BS = *BSRef;
 
-  std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  std::optional<ScheduleData *> Bundle =
+      BS.tryScheduleBundle(UniqueValues, this, S);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
@@ -7537,7 +7560,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
-  const unsigned Sz = VL.size();
+  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+  const unsigned Sz = UniqueValues.size();
   auto GetCostDiff =
       [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
           function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7644,7 +7668,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     // Count reused scalars.
     InstructionCost ScalarCost = 0;
     SmallPtrSet<const TreeEntry *, 4> CountedOps;
-    for (Value *V : VL) {
+    for (Value *V : UniqueValues) {
       auto *PHI = dyn_cast<PHINode>(V);
       if (!PHI)
         continue;
@@ -7665,8 +7689,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *I = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
@@ -7844,9 +7868,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    }
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI =
-          VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = VL0->getOpcode() == Opcode
+                     ? cast<Instruction>(UniqueValues[Idx])
+                     : nullptr;
      return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
                                   TTI::getCastContextHint(VI), CostKind, VI);
    };
@@ -7891,7 +7916,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                      ? CmpInst::BAD_FCMP_PREDICATE
                                      : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                           ? CmpInst::BAD_FCMP_PREDICATE
                                           : CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7976,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
@@ -7975,14 +8000,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<LoadInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
-    auto GetVectorCost = [=](InstructionCost CommonCost) {
+    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      if (E->State == TreeEntry::Vectorize) {
        VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8018,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                E->State == TreeEntry::PossibleStridedVectorize) &&
               "Unknown EntryState");
        Align CommonAlignment = LI0->getAlign();
-        for (Value *V : VL)
+        for (Value *V : UniqueValues)
          CommonAlignment =
              std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
        VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8070,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *CI = cast<CallInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8112,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
      }
      return false;
    };
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      (void)E;
      return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8632,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  SmallVector<SmallVector<std::optional<TTI::ShuffleKind>>> ShuffleMasks;
  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
  SmallVector<APInt> DemandedElts;
+  SmallDenseSet<Value *, 4> UsedInserts;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8653,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
    // to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+        if (!UsedInserts.insert(VU).second)
+          continue;
        std::optional<unsigned> InsertIdx = getInsertIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -11008,6 +11036,7 @@ Value *BoUpSLP::vectorizeTree(
  // Maps extract Scalar to the corresponding extractelement instruction in the
  // basic block. Only one extractelement per block should be emitted.
  DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+  SmallDenseSet<Value *, 4> UsedInserts;
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11135,8 @@ Value *BoUpSLP::vectorizeTree(
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+          if (!UsedInserts.insert(VU).second)
+            continue;
          std::optional<unsigned> InsertIdx = getInsertIndex(VU);
          if (InsertIdx) {
            // Need to use original vector, if the root is truncated.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index dd65f126021bb..26d3a405019bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 58444a257aa66..bbf56b9a86ce1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index 69d0e6241b9c0..b59659ca75eb2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -242,40 +242,31 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_16xi8(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <8 x i8> [[TMP0]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i8> [[TMP0]], <8 x i8> [[TMP3]]
-; CHECK-NEXT:    store <8 x i8> [[TMP4]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8
 ; CHECK-NEXT:    [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1
 ; CHECK-NEXT:    [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
-; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP5]], i8 [[X]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i8> [[TMP4]], i32 0
-; CHECK-NEXT:    store i8 [[TMP6]], ptr [[GEP_8]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[GEP_9]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt <4 x i8> [[TMP7]], <i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP8]], <4 x i8> [[TMP7]], <4 x i8> [[TMP10]]
-; CHECK-NEXT:    store <4 x i8> [[TMP11]], ptr [[GEP_9]], align 2
-; CHECK-NEXT:    [[GEP_13:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 13
-; CHECK-NEXT:    [[L_13:%.*]] = load i8, ptr [[GEP_13]], align 1
-; CHECK-NEXT:    [[CMP_13:%.*]] = icmp ugt i8 [[L_13]], -1
-; CHECK-NEXT:    [[S_13:%.*]] = select i1 [[CMP_13]], i8 [[L_13]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_13]], ptr [[GEP_13]], align 2
-; CHECK-NEXT:    [[GEP_14:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 14
-; CHECK-NEXT:    [[L_14:%.*]] = load i8, ptr [[GEP_14]], align 1
-; CHECK-NEXT:    [[CMP_14:%.*]] = icmp ugt i8 [[L_14]], -1
-; CHECK-NEXT:    [[S_14:%.*]] = select i1 [[CMP_14]], i8 [[L_14]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_14]], ptr [[GEP_14]], align 2
-; CHECK-NEXT:    [[GEP_15:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 15
-; CHECK-NEXT:    [[L_15:%.*]] = load i8, ptr [[GEP_15]], align 1
-; CHECK-NEXT:    [[CMP_15:%.*]] = icmp ugt i8 [[L_15]], -1
-; CHECK-NEXT:    [[S_15:%.*]] = select i1 [[CMP_15]], i8 [[L_15]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_15]], ptr [[GEP_15]], align 2
+; CHECK-NEXT:    [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10
+; CHECK-NEXT:    [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1
+; CHECK-NEXT:    [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11
+; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
+; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]]
+; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
index b5602c795f42b..9c8569fcbdf75 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -16,15 +16,10 @@ define i32 @test(ptr nocapture %G) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[G:%.*]], i64 5
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], <double 4.000000e+00, double 3.000000e+00>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 1.000000e+00, double 6.000000e+00>
-; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[G]], align 8
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[G]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL11]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 7.000000e+00, double 8.000000e+00>
-; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00, double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    store <4 x double> [[TMP3]], ptr [[G]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
index 740bf71f60e44..e7239f906c59d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -5,15 +5,9 @@
 
 define dso_local <4 x float> @foo(<4 x i32> %0) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = extractelement <4 x i32> %0, i32 1
   %3 = sitofp i32 %2 to float

From 4bb40fac621e36794b26afd637ceb238c006b90c Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Wed, 6 Sep 2023 13:25:34 -0700
Subject: [PATCH 2/2] Rebase

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 09a12ae44ac94..ffd58ca74109a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5614,14 +5614,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                                           !isConstant(V);
                                                  })) ||
           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
-        SmallVector<Value *> IgnoredVals;
-        if (UserIgnoreList)
-          IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
         if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
            all_of(UniqueValues, [=](Value *V) {
              return isa<ExtractElementInst>(V) ||
-                    areAllUsersVectorized(cast<Instruction>(V), IgnoredVals);
+                    areAllUsersVectorized(cast<Instruction>(V),
+                                          UserIgnoreList);
            })) {
          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
          if (PWSz == VL.size()) {
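
A minimal standalone sketch of the bundle-padding step this patch introduces (illustrative only, not part of the patch: plain std::vector<int> stands in for a bundle of Value *, and powerOf2Ceil/padToPowerOf2 are local stand-ins for LLVM's PowerOf2Ceil and the NonUniqueValueVL logic in TryToFindDuplicates):

#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for llvm::PowerOf2Ceil: smallest power of two >= V.
static std::uint64_t powerOf2Ceil(std::uint64_t V) {
  std::uint64_t P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

struct PaddedBundle {
  std::vector<int> Scalars;             // unique scalars, padded
  std::vector<int> ReuseShuffleIndices; // lane of each original scalar
};

// Keep only the unique scalars of VL, remember which padded lane each
// original scalar maps to, and pad the unique scalars to the next power of
// two by repeating the last one, mirroring
// NonUniqueValueVL.append(PWSz - UniqueValues.size(), UniqueValues.back()).
static PaddedBundle padToPowerOf2(const std::vector<int> &VL) {
  PaddedBundle R;
  for (int V : VL) {
    // Position of V among the unique scalars seen so far (linear scan,
    // where the real code uses a DenseMap).
    std::size_t Pos = 0;
    while (Pos < R.Scalars.size() && R.Scalars[Pos] != V)
      ++Pos;
    if (Pos == R.Scalars.size())
      R.Scalars.push_back(V);
    R.ReuseShuffleIndices.push_back(static_cast<int>(Pos));
  }
  std::uint64_t PWSz = powerOf2Ceil(R.Scalars.size());
  while (R.Scalars.size() < PWSz)
    R.Scalars.push_back(R.Scalars.back());
  return R;
}

// Example: the bundle {1, 2, 1, 2, 3, 1} has three unique scalars, so it is
// vectorized as the padded bundle {1, 2, 3, 3} (width 4, a power of two)
// with the reuse-shuffle mask {0, 1, 0, 1, 2, 0}.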