diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index ba47cef274bec..90d92e0fcf55c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1381,16 +1381,16 @@ class TargetTransformInfo {
       const SmallBitVector &OpcodeMask,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

-  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
-  /// The exact mask may be passed as Mask, or else the array will be empty.
-  /// The index and subtype parameters are used by the subvector insertion and
-  /// extraction shuffle kinds to show the insert/extract point and the type of
-  /// the subvector being inserted/extracted. The operands of the shuffle can be
-  /// passed through \p Args, which helps improve the cost estimation in some
-  /// cases, like in broadcast loads.
-  /// NOTE: For subvector extractions Tp represents the source type.
+  /// \return The cost of a shuffle instruction of kind Kind with inputs of type
+  /// SrcTy, producing a vector of type DstTy. The exact mask may be passed as
+  /// Mask, or else the array will be empty. The Index and SubTp parameters
+  /// are used by the subvector insertion shuffle kinds to show the insert
+  /// point and the type of the subvector being inserted. The operands of the
+  /// shuffle can be passed through \p Args, which helps improve the cost
+  /// estimation in some cases, like in broadcast loads.
   LLVM_ABI InstructionCost getShuffleCost(
-      ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask = {},
+      ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+      ArrayRef<int> Mask = {},
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0,
       VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const;
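As an aside, here is a sketch of how a cost-model client would call the updated overload; the helper name `costExamples` and the concrete types are illustrative assumptions, not part of the patch. Length-preserving kinds pass the same type twice, while a subvector extraction passes the wide input as SrcTy and the narrow result as DstTy.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Illustrative only: not part of the patch.
InstructionCost costExamples(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  auto *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  auto *V8F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 8);

  // Broadcast: length-preserving, so DstTy == SrcTy.
  InstructionCost Splat = TTI.getShuffleCost(TTI::SK_Broadcast, V4F32, V4F32);

  // Extract the high <4 x float> half of a <8 x float>: SrcTy is the wide
  // input, DstTy (and SubTp) describe the narrow result.
  InstructionCost Extract =
      TTI.getShuffleCost(TTI::SK_ExtractSubvector, V4F32, V8F32, {},
                         TTI::TCK_RecipThroughput, /*Index=*/4,
                         /*SubTp=*/V4F32);
  return Splat + Extract;
}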
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 640766cf8cd10..c22928c9bcd94 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -710,9 +710,9 @@ class TargetTransformInfoImplBase {
   }

   virtual InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const {
     return 1;
   }
@@ -1541,13 +1541,14 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
         return 0;

       if (Shuffle->isExtractSubvectorMask(SubIndex))
-        return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
-                                         Mask, CostKind, SubIndex, VecTy,
-                                         Operands, Shuffle);
+        return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+                                         VecSrcTy, Mask, CostKind, SubIndex,
+                                         VecTy, Operands, Shuffle);

       if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
         return TargetTTI->getShuffleCost(
-            TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
+            TTI::SK_InsertSubvector, VecTy, VecSrcTy, Mask, CostKind,
+            SubIndex,
             FixedVectorType::get(VecTy->getScalarType(), NumSubElts),
             Operands, Shuffle);

@@ -1576,21 +1577,24 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
         return TargetTTI->getShuffleCost(
             IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
-            AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
+            VecTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
       }

       // Narrowing shuffle - perform shuffle at original wider width and
       // then extract the lower elements.
+      // FIXME: This can assume widening, which is not true of all vector
+      // architectures (and is not even the default).
       AdjustMask.append(NumSubElts - Mask.size(), PoisonMaskElem);

       InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
           IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
-          VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
+          VecSrcTy, VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands,
+          Shuffle);

       SmallVector<int> ExtractMask(Mask.size());
       std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
       return ShuffleCost + TargetTTI->getShuffleCost(
-                               TTI::SK_ExtractSubvector, VecSrcTy,
+                               TTI::SK_ExtractSubvector, VecTy, VecSrcTy,
                                ExtractMask, CostKind, 0, VecTy, {}, Shuffle);
     }

@@ -1598,40 +1602,44 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       return 0;

     if (Shuffle->isReverse())
-      return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, Mask, CostKind,
-                                       0, nullptr, Operands, Shuffle);
+      return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, VecSrcTy, Mask,
+                                       CostKind, 0, nullptr, Operands,
+                                       Shuffle);

     if (Shuffle->isSelect())
-      return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, Mask, CostKind,
-                                       0, nullptr, Operands, Shuffle);
+      return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, VecSrcTy, Mask,
+                                       CostKind, 0, nullptr, Operands,
+                                       Shuffle);

     if (Shuffle->isTranspose())
-      return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, Mask,
-                                       CostKind, 0, nullptr, Operands,
+      return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, VecSrcTy,
+                                       Mask, CostKind, 0, nullptr, Operands,
                                        Shuffle);

     if (Shuffle->isZeroEltSplat())
-      return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, Mask,
-                                       CostKind, 0, nullptr, Operands,
+      return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, VecSrcTy,
+                                       Mask, CostKind, 0, nullptr, Operands,
                                        Shuffle);

     if (Shuffle->isSingleSource())
-      return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask,
-                                       CostKind, 0, nullptr, Operands,
-                                       Shuffle);
+      return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy,
+                                       VecSrcTy, Mask, CostKind, 0, nullptr,
+                                       Operands, Shuffle);

     if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
       return TargetTTI->getShuffleCost(
-          TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
+          TTI::SK_InsertSubvector, VecTy, VecSrcTy, Mask, CostKind, SubIndex,
           FixedVectorType::get(VecTy->getScalarType(), NumSubElts),
           Operands, Shuffle);

     if (Shuffle->isSplice(SubIndex))
-      return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, Mask, CostKind,
-                                       SubIndex, nullptr, Operands, Shuffle);
+      return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, VecSrcTy, Mask,
+                                       CostKind, SubIndex, nullptr, Operands,
+                                       Shuffle);

-    return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask,
-                                     CostKind, 0, nullptr, Operands, Shuffle);
+    return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, VecSrcTy,
+                                     Mask, CostKind, 0, nullptr, Operands,
+                                     Shuffle);
   }
   case Instruction::ExtractElement: {
     auto *EEI = dyn_cast<ExtractElementInst>(U);
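The widen-then-extract pricing above can be made concrete with a small sketch. Assuming a narrowing two-input shuffle from <8 x i32> down to <4 x i32>, the CRTP base pads the mask with poison out to the source width, prices a full-width permute, and adds a low-half extract. `narrowingShuffleCost` is a hypothetical stand-in for that logic, not the code itself.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch only: costing shufflevector <8 x i32> %a, <8 x i32> %b, <4 x i32> mask
// the way the CRTP base decomposes it.
InstructionCost narrowingShuffleCost(const TargetTransformInfo &TTI,
                                     LLVMContext &Ctx) {
  auto *SrcTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8);
  auto *DstTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);

  SmallVector<int> WideMask = {0, 9, 2, 11}; // the original 4-lane mask
  WideMask.append(4, PoisonMaskElem);        // pad to the 8-lane source width

  InstructionCost Permute =
      TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SrcTy, SrcTy, WideMask);
  InstructionCost Extract =
      TTI.getShuffleCost(TTI::SK_ExtractSubvector, DstTy, SrcTy, {},
                         TTI::TCK_RecipThroughput, /*Index=*/0,
                         /*SubTp=*/DstTy);
  return Permute + Extract;
}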
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 90a75c3d352e4..0477c1b6f1a6f 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -329,11 +329,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       // Cost the call + mask.
       auto Cost = thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(),
                                             CostKind);
-      if (VD->isMasked())
-        Cost += thisT()->getShuffleCost(
-            TargetTransformInfo::SK_Broadcast,
-            VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
-            nullptr, {});
+      if (VD->isMasked()) {
+        auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+        Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+                                        VecTy, {}, CostKind, 0, nullptr, {});
+      }

       // Lowering to a library call (with output pointers) may require us to emit
       // reloads for the results.
@@ -1101,11 +1101,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                               ArrayRef<int> Mask,
-                                              VectorType *Ty, int &Index,
+                                              VectorType *SrcTy, int &Index,
                                               VectorType *&SubTy) const {
     if (Mask.empty())
       return Kind;
-    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
+    int NumSrcElts = SrcTy->getElementCount().getKnownMinValue();
     switch (Kind) {
     case TTI::SK_PermuteSingleSrc: {
       if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
@@ -1116,7 +1116,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         return TTI::SK_Broadcast;
       if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
           (Index + Mask.size()) <= (size_t)NumSrcElts) {
-        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
+        SubTy = FixedVectorType::get(SrcTy->getElementType(), Mask.size());
         return TTI::SK_ExtractSubvector;
       }
       break;
@@ -1127,7 +1127,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
               Mask, NumSrcElts, NumSubElts, Index)) {
         if (Index + NumSubElts > NumSrcElts)
           return Kind;
-        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
+        SubTy = FixedVectorType::get(SrcTy->getElementType(), NumSubElts);
         return TTI::SK_InsertSubvector;
       }
       if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
@@ -1151,13 +1151,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }

   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override {
-    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
+    switch (improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp)) {
     case TTI::SK_Broadcast:
-      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
+      if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
         return getBroadcastShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_Select:
@@ -1166,14 +1166,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case TTI::SK_Transpose:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
-      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
+      if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
         return getPermuteShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_ExtractSubvector:
-      return getExtractSubvectorOverhead(Tp, CostKind, Index,
+      return getExtractSubvectorOverhead(SrcTy, CostKind, Index,
                                          cast<FixedVectorType>(SubTp));
     case TTI::SK_InsertSubvector:
-      return getInsertSubvectorOverhead(Tp, CostKind, Index,
+      return getInsertSubvectorOverhead(DstTy, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
     }
     llvm_unreachable("Unknown TTI::ShuffleKind");
@@ -1910,6 +1910,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
+                                     cast<VectorType>(RetTy),
                                      cast<VectorType>(Args[0]->getType()), {},
                                      CostKind, Index, cast<VectorType>(RetTy));
     }
@@ -1920,17 +1921,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
       return thisT()->getShuffleCost(
-          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
-          CostKind, Index, cast<VectorType>(Args[1]->getType()));
+          TTI::SK_InsertSubvector, cast<VectorType>(RetTy),
+          cast<VectorType>(Args[0]->getType()), {}, CostKind, Index,
+          cast<VectorType>(Args[1]->getType()));
     }
     case Intrinsic::vector_reverse: {
-      return thisT()->getShuffleCost(TTI::SK_Reverse,
+      return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
                                      cast<VectorType>(Args[0]->getType()), {},
                                      CostKind, 0, cast<VectorType>(RetTy));
     }
     case Intrinsic::vector_splice: {
       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
-      return thisT()->getShuffleCost(TTI::SK_Splice,
+      return thisT()->getShuffleCost(TTI::SK_Splice, cast<VectorType>(RetTy),
                                      cast<VectorType>(Args[0]->getType()), {},
                                      CostKind, Index, cast<VectorType>(RetTy));
     }
@@ -2376,8 +2378,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                         CostKind, 1, nullptr, nullptr);
     Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
                                         CostKind, 0, nullptr, nullptr);
-    Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, {}, CostKind,
-                                    0, nullptr);
+    Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, SearchTy, {},
+                                    CostKind, 0, nullptr);
     Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
                                         CmpInst::ICMP_EQ, CostKind);
     Cost +=
@@ -2961,8 +2963,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
-      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
-                                             CostKind, NumVecElts, SubTy);
+      ShuffleCost += thisT()->getShuffleCost(
+          TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
       Ty = SubTy;
       ++LongVectorCount;
@@ -2978,7 +2980,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // By default reductions need one shuffle per reduction level.
     ShuffleCost +=
         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
-                                                 {}, CostKind, 0, Ty);
+                                                 Ty, {}, CostKind, 0, Ty);
     ArithCost +=
         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
     return ShuffleCost + ArithCost +
@@ -3052,8 +3054,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       NumVecElts /= 2;
       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);

-      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
-                                             CostKind, NumVecElts, SubTy);
+      ShuffleCost += thisT()->getShuffleCost(
+          TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);

       IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
       MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
@@ -3069,7 +3071,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // architecture-dependent length.
     ShuffleCost +=
         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
-                                                 {}, CostKind, 0, Ty);
+                                                 Ty, {}, CostKind, 0, Ty);
     IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
     MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
     // The last min/max should be in vector registers and we counted it above.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8cc7f8a9d2ab2..3ebd9d487ba04 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -985,11 +985,16 @@ InstructionCost TargetTransformInfo::getAltInstrCost(
 }

 InstructionCost TargetTransformInfo::getShuffleCost(
-    ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
+    ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef<int> Mask,
     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
     ArrayRef<const Value *> Args, const Instruction *CxtI) const {
-  InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind,
-                                                 Index, SubTp, Args, CxtI);
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+  InstructionCost Cost = TTIImpl->getShuffleCost(
+      Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp, Args, CxtI);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
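The new asserts make the DstTy/SrcTy contract explicit: a non-empty mask must match the destination's (minimum) element count, and both types must share a scalar type. A hypothetical caller-side illustration of what now does and does not pass the asserts (not from the patch):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Sketch only: demonstrates the contract, assuming an asserts-enabled build.
void checkShuffleContract(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  auto *V8I16 = FixedVectorType::get(Type::getInt16Ty(Ctx), 8);
  auto *V4I16 = FixedVectorType::get(Type::getInt16Ty(Ctx), 4);

  SmallVector<int> Mask = {0, 2, 4, 6}; // 4 lanes, so DstTy must be 4 wide.
  // OK: the mask length matches the <4 x i16> destination.
  (void)TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, V4I16, V8I16, Mask);

  // Would assert: an <8 x i16> destination expects an 8-element mask, and a
  // float source would also trip the matching-scalar-type assert.
  // (void)TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, V8I16, V8I16, Mask);
}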
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9d5c984fa4f16..8c6f272a8c8da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5468,19 +5468,25 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost;
 }

-InstructionCost AArch64TTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+InstructionCost
+AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);

   // If we have a Mask, and the LT is being legalized somehow, split the Mask
   // into smaller vectors and sum the cost of each shuffle.
-  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
+  if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
       LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
-      Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
+      SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
       Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
-
     // Check for LD3/LD4 instructions, which are represented in llvm IR as
     // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
     // but we model it with a cost of LT.first so that LD3/LD4 have a higher
@@ -5496,16 +5502,16 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     // cost than just the store.
     if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
         (ShuffleVectorInst::isInterleaveMask(
-             Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
+             Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
          ShuffleVectorInst::isInterleaveMask(
-             Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
+             Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
       return LT.first;

     unsigned TpNumElts = Mask.size();
     unsigned LTNumElts = LT.second.getVectorNumElements();
     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
-    VectorType *NTp =
-        VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
+    VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
+                                      LT.second.getVectorElementCount());
     InstructionCost Cost;
     std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
         PreviousCosts;
@@ -5513,7 +5519,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
       SmallVector<int> NMask;
       // Split the existing mask into chunks of size LTNumElts. Track the source
       // sub-vectors to ensure the result has at most 2 inputs.
-      unsigned Source1 = 0, Source2 = 0;
+      unsigned Source1 = -1U, Source2 = -1U;
       unsigned NumSources = 0;
       for (unsigned E = 0; E < LTNumElts; E++) {
         int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
@@ -5561,7 +5567,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
           NumSources <= 2
               ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
-                               NTp, NMask, CostKind, 0, nullptr, Args, CxtI)
+                               NTp, NTp, NMask, CostKind, 0, nullptr, Args,
+                               CxtI)
               : LTNumElts;
       Result.first->second = NCost;
       Cost += NCost;
@@ -5569,7 +5576,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     return Cost;
   }

-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
   // A subvector extract can be implemented with an ext (or trivial extract, if
   // from lane 0). This currently only handles low or high extracts to prevent
@@ -5585,6 +5592,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     }
     Kind = TTI::SK_PermuteSingleSrc;
   }
+  // FIXME: This was added to keep the costs equal when adding DstTys. Update
+  // the code to handle length-changing shuffles.
+  if (Kind == TTI::SK_InsertSubvector) {
+    LT = getTypeLegalizationCost(DstTy);
+    SrcTy = DstTy;
+  }

   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
@@ -5596,15 +5609,17 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
     if (IsLoad && LT.second.isVector() &&
-        isLegalBroadcastLoad(Tp->getElementType(),
+        isLegalBroadcastLoad(SrcTy->getElementType(),
                              LT.second.getVectorElementCount()))
       return 0;
   }

   // If we have 4 elements for the shuffle and a Mask, get the cost straight
   // from the perfect shuffle tables.
-  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
-      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
+  if (Mask.size() == 4 &&
+      SrcTy->getElementCount() == ElementCount::getFixed(4) &&
+      (SrcTy->getScalarSizeInBits() == 16 ||
+       SrcTy->getScalarSizeInBits() == 32) &&
       all_of(Mask, [](int E) { return E < 8; }))
     return getPerfectShuffleCost(Mask);
@@ -5764,8 +5779,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
       return LT.first * Entry->Cost;
   }
-  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
-    return getSpliceCost(Tp, Index, CostKind);
+  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
+    return getSpliceCost(SrcTy, Index, CostKind);

   // Inserting a subvector can often be done with either a D, S or H register
   // move, so long as the inserted vector is "aligned".
@@ -5783,8 +5798,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   // Restore optimal kind.
   if (IsExtractSubvector)
     Kind = TTI::SK_ExtractSubvector;
-  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
-                               CxtI);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
+                               Args, CxtI);
 }

 static bool containsDecreasingPointers(Loop *TheLoop,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 470af01be3154..9ada70bd7086a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -451,9 +451,9 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;

   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;

   InstructionCost getScalarizationOverhead(
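The AArch64 mask-splitting loop above is intricate; the sketch below is a heavily simplified model of it, under stated assumptions: `countDistinctSourceRegs` is a defined-here stand-in for bookkeeping the real code does inline, and the remapping of chunk elements onto the two chosen source registers is omitted. Each legal-width chunk touching at most two legalized source registers is priced as a one- or two-input permute of the legalized type NTp; anything wider is charged LTNumElts.

#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Helper for the sketch: how many distinct LTNumElts-wide source registers a
// chunk reads from (an all-poison chunk reports 0).
static unsigned countDistinctSourceRegs(ArrayRef<int> Chunk,
                                        unsigned LTNumElts) {
  SmallSet<unsigned, 4> Regs;
  for (int Elt : Chunk)
    if (Elt >= 0)
      Regs.insert(Elt / LTNumElts);
  return Regs.size();
}

// Simplified model only; not the in-tree implementation.
InstructionCost splitMaskCost(const TargetTransformInfo &TTI, VectorType *NTp,
                              ArrayRef<int> Mask, unsigned LTNumElts,
                              TargetTransformInfo::TargetCostKind CostKind) {
  InstructionCost Cost = 0;
  for (unsigned N = 0; N < Mask.size(); N += LTNumElts) {
    ArrayRef<int> Chunk =
        Mask.slice(N, std::min<size_t>(LTNumElts, Mask.size() - N));
    unsigned NumSources = countDistinctSourceRegs(Chunk, LTNumElts);
    if (NumSources == 0)
      continue; // An all-poison chunk costs nothing.
    // The real code remaps Chunk's indices into the selected registers and
    // pads the final chunk; here we only pad, to keep the sketch short.
    SmallVector<int> NMask(Chunk.begin(), Chunk.end());
    NMask.append(LTNumElts - NMask.size(), PoisonMaskElem);
    Cost += NumSources <= 2
                ? TTI.getShuffleCost(NumSources == 1 ? TTI::SK_PermuteSingleSrc
                                                     : TTI::SK_PermuteTwoSrc,
                                     NTp, NTp, NMask, CostKind, 0, nullptr)
                : LTNumElts;
  }
  return Cost;
}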
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d5a1aaef4ad68..5e41273556d3d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1183,21 +1183,23 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
 }

 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *VT, ArrayRef<int> Mask,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args,
                                            const Instruction *CxtI) const {
-  if (!isa<FixedVectorType>(VT))
-    return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+  if (!isa<FixedVectorType>(SrcTy))
+    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                 SubTp);

-  Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+      DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
@@ -1239,7 +1241,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     }
   }

-  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }

 /// Whether it is profitable to sink the operands of an
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 0fae301abf532..64a244e33f18f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -236,9 +236,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   InstructionCost getVectorSplitCost() const { return 0; }

   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;

   bool isProfitableToSinkOperands(Instruction *I,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6c3a1ae7e1775..203fb76d7be86 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1233,12 +1233,19 @@ InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
 }

 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *Tp, ArrayRef<int> Mask,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args,
                                            const Instruction *CxtI) const {
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

   // Treat extractsubvector as single op permutation.
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
   if (IsExtractSubvector)
@@ -1259,7 +1266,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry =
               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
@@ -1280,7 +1287,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry =
               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
@@ -1304,7 +1311,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                               ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
@@ -1320,7 +1327,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                               LT.second))
         return LT.first * Entry->Cost *
@@ -1328,7 +1335,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   }

   if (!Mask.empty()) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
     if (LT.second.isVector() &&
         Mask.size() <= LT.second.getVectorNumElements() &&
         (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
@@ -1340,11 +1347,11 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // Restore optimal kind.
   if (IsExtractSubvector)
     Kind = TTI::SK_ExtractSubvector;
-  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+  int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                      : 1;
-  return BaseCost *
-         BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+  return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
+                                          Index, SubTp);
 }

 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index c1af4e3dc5da6..ca06b9e3cb661 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -223,9 +223,9 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
   int getNumMemOps(const IntrinsicInst *I) const;

   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;

   bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index a4cc472fdbf29..9fb7d471fd22a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -226,10 +226,12 @@ HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       CostKind);
 }

-InstructionCost HexagonTTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+InstructionCost
+HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
   return 1;
 }

diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index c03cad4713e40..af8dede723083 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,9 +123,9 @@ class HexagonTTIImpl final : public BasicTTIImplBase<HexagonTTIImpl> {
                   unsigned AddressSpace,
                   TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index cd9b226ca82dc..2fba090f2d501 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -604,19 +604,20 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
 }

 InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *Tp, ArrayRef<int> Mask,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args,
                                            const Instruction *CxtI) const {
   InstructionCost CostFactor =
-      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
+      vectorCostAdjustmentFactor(Instruction::ShuffleVector, SrcTy, nullptr);
   if (!CostFactor.isValid())
     return InstructionCost::getMax();

   // Legalize the type.
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);

   // PPC, for both Altivec/VSX, support cheap arbitrary permutations
   // (at least in the sense that there need only be one non-loop-invariant
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index bc5f7a4d06de1..475472ac3720f 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -111,9 +111,9 @@ class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> {
                  ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
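In the RISC-V splitting callbacks that follow, the destination type of each per-register shuffle is now rebuilt from the sub-mask length instead of reusing the source register type, since a sub-mask may be shorter than a full register. A minimal sketch of that pattern (the names mirror the diff; the surrounding processShuffleMasks-style driver is omitted and assumed):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Sketch only: cost one per-register single-source sub-shuffle.
void costOneRegShuffle(const TargetTransformInfo &TTI,
                       FixedVectorType *SingleOpTy, ArrayRef<int> RegMask,
                       TargetTransformInfo::TargetCostKind CostKind,
                       InstructionCost &Cost) {
  // The destination width is the sub-mask length; the source stays a full
  // legalized register.
  auto *DstTy =
      FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size());
  Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, DstTy, SingleOpTy,
                             RegMask, CostKind, 0, nullptr);
}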
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 1b80b0fcaf10a..67a51c12b508e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -452,12 +452,16 @@ static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
         if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
                 .second)
           return;
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                   RegMask, CostKind, 0, nullptr);
+        Cost += TTI.getShuffleCost(
+            TTI::SK_PermuteSingleSrc,
+            FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
+            SingleOpTy, RegMask, CostKind, 0, nullptr);
       },
       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
-                                   CostKind, 0, nullptr);
+        Cost += TTI.getShuffleCost(
+            TTI::SK_PermuteTwoSrc,
+            FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
+            SingleOpTy, RegMask, CostKind, 0, nullptr);
       });
   return Cost;
 }
@@ -526,11 +530,11 @@ costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
           return;
         ++NumShuffles;
         Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                   RegMask, CostKind, 0, nullptr);
+                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
       },
       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
-                                   CostKind, 0, nullptr);
+        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
         NumShuffles += 2;
       });
   // Note: check that we do not emit too many shuffles here to prevent code
@@ -601,22 +605,29 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }

-InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                             VectorType *Tp, ArrayRef<int> Mask,
-                                             TTI::TargetCostKind CostKind,
-                                             int Index, VectorType *SubTp,
-                                             ArrayRef<const Value *> Args,
-                                             const Instruction *CxtI) const {
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+InstructionCost
+RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                             VectorType *SrcTy, ArrayRef<int> Mask,
+                             TTI::TargetCostKind CostKind, int Index,
+                             VectorType *SubTp, ArrayRef<const Value *> Args,
+                             const Instruction *CxtI) const {
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);

   // First, handle cases where having a fixed length vector enables us to
   // give a more accurate cost than falling back to generic scalable codegen.
   // TODO: Each of these cases hints at a modeling gap around scalable vectors.
-  if (auto *FVTp = dyn_cast<FixedVectorType>(Tp);
+  if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
       FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
     InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
-        *this, LT.second, ST->getRealVLen(), Tp, Mask, CostKind);
+        *this, LT.second, ST->getRealVLen(),
+        Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
     if (VRegSplittingCost.isValid())
       return VRegSplittingCost;
     switch (Kind) {
@@ -655,7 +666,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         for (unsigned I = 0; I != NumSlides; ++I) {
           unsigned InsertIndex = SubVectorSize * (1 << I);
           FixedVectorType *SubTp =
-              FixedVectorType::get(Tp->getElementType(), InsertIndex);
+              FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
           FixedVectorType *DestTp =
               FixedVectorType::getDoubleElementsVectorType(SubTp);
           std::pair<InstructionCost, MVT> DestLT =
               getTypeLegalizationCost(DestTp);
           // Add the cost of whole vector register move because the
           // destination vector register group for vslideup cannot overlap the
           // source.
           Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
-          Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
+          Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
                                  CostKind, InsertIndex, SubTp);
         }
         return Cost;
@@ -680,7 +691,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
                             LT.second.getVectorNumElements() <= 256)) {
         VectorType *IdxTy =
-            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
+            getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
         InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
         return IndexCost +
                getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
@@ -699,8 +710,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       // (shuffle) mask.
       if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
                             LT.second.getVectorNumElements() <= 256)) {
-        auto &C = Tp->getContext();
-        auto EC = Tp->getElementCount();
+        auto &C = SrcTy->getContext();
+        auto EC = SrcTy->getElementCount();
         VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
         VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
         InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
@@ -769,6 +780,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     // Example sequence:
     // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
     // vslideup.vi v8, v9, 2
+    LT = getTypeLegalizationCost(DstTy);
     return LT.first *
            getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
   case TTI::SK_Select: {
@@ -846,14 +858,15 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     // TODO: Cases to improve here:
     // * Illegal vector types
     // * i64 on RV32
-    if (Tp->getElementType()->isIntegerTy(1)) {
+    if (SrcTy->getElementType()->isIntegerTy(1)) {
       VectorType *WideTy =
-          VectorType::get(IntegerType::get(Tp->getContext(), 8),
-                          cast<VectorType>(Tp)->getElementCount());
-      return getCastInstrCost(Instruction::ZExt, WideTy, Tp,
+          VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
+                          cast<VectorType>(SrcTy)->getElementCount());
+      return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
                               TTI::CastContextHint::None, CostKind) +
-             getShuffleCost(TTI::SK_Reverse, WideTy, {}, CostKind, 0, nullptr) +
-             getCastInstrCost(Instruction::Trunc, Tp, WideTy,
+             getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
+                            nullptr) +
+             getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
                               TTI::CastContextHint::None, CostKind);
     }
@@ -899,7 +912,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       return FixedCost + LT.first * (GatherCost + SlideCost);
     }
   }
-  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }

 static unsigned isM1OrSmaller(MVT VT) {
@@ -1025,8 +1039,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
       auto Mask = createStrideMask(Index, Factor, VF);
       Mask.resize(VF * Factor, -1);
       InstructionCost ShuffleCost =
-          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy, Mask,
-                         CostKind, 0, nullptr, {});
+          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy, VecTy,
+                         Mask, CostKind, 0, nullptr, {});
       Cost += ShuffleCost;
     }
     return Cost;
@@ -1052,7 +1066,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     // shuffle that goes into the wide store
     auto Mask = createInterleaveMask(VF, Factor);
     InstructionCost ShuffleCost =
-        getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
+        getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, FVTy, Mask,
                        CostKind, 0, nullptr, {});
     return MemCost + ShuffleCost;
   }
@@ -1523,7 +1537,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     // To support type-based query from vectorizer, set the index to 0.
     // Note that index only change the cost from vslide.vx to vslide.vi and in
     // current implementations they have same costs.
-    return getShuffleCost(TTI::SK_Splice,
+    return getShuffleCost(TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()),
                           cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
                           0, cast<VectorType>(ICA.getReturnType()));
   }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 83ac71ed9da69..12bf8c1b4de70 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -160,9 +160,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   }

   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;

   InstructionCost getScalarizationOverhead(
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 68ba7498d586b..f32c9bd2bdea1 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -738,20 +738,22 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
                                        Args, CxtI);
 }

-InstructionCost SystemZTTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+InstructionCost
+SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   if (ST->hasVector()) {
-    unsigned NumVectors = getNumVectorRegs(Tp);
+    unsigned NumVectors = getNumVectorRegs(SrcTy);

     // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

     // FP128 values are always in scalar registers, so there is no work
     // involved with a shuffle, except for broadcast. In that case register
     // moves are done with a single instruction per element.
-    if (Tp->getScalarType()->isFP128Ty())
+    if (SrcTy->getScalarType()->isFP128Ty())
       return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

     switch (Kind) {
@@ -775,7 +777,8 @@ InstructionCost SystemZTTIImpl::getShuffleCost(
     }
   }

-  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }

 // Return the log2 difference of the element sizes of the two vector types.
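Out-of-tree targets face the same mechanical update as the in-tree backends above. A minimal sketch of an adapted override, assuming a hypothetical `MyTTIImpl` backend (BaseT is the usual BasicTTIImplBase typedef); this mirrors the in-tree pattern rather than prescribing an API:

#include "llvm/CodeGen/BasicTTIImpl.h"
using namespace llvm;

// Hypothetical backend override; illustrative only.
InstructionCost
MyTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                          VectorType *SrcTy, ArrayRef<int> Mask,
                          TTI::TargetCostKind CostKind, int Index,
                          VectorType *SubTp, ArrayRef<const Value *> Args,
                          const Instruction *CxtI) const {
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
  // Legalize the source type, except for kinds where the result length
  // differs (the SK_InsertSubvector fixups in this patch do the same).
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(
      Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy);
  // ... target-specific cost tables keyed on LT.second would go here ...
  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp, Args, CxtI);
}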
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 368a4af768b3e..dc5736e8af009 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -102,9 +102,9 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
                  ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy) const;
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a1a177528eb23..6a05a1700f0cb 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1522,15 +1522,24 @@ X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
   return InstructionCost::getInvalid();
 }

-InstructionCost X86TTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
+                                           TTI::TargetCostKind CostKind,
+                                           int Index, VectorType *SubTp,
+                                           ArrayRef<const Value *> Args,
+                                           const Instruction *CxtI) const {
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);

-  Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

   // If all args are constant then this will be constant folded away.
   if (!Args.empty() &&
@@ -1539,11 +1548,12 @@ InstructionCost X86TTIImpl::getShuffleCost(

   // Recognize a basic concat_vector shuffle.
   if (Kind == TTI::SK_PermuteTwoSrc &&
-      Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
+      Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
       ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
     return getShuffleCost(TTI::SK_InsertSubvector,
-                          VectorType::getDoubleElementsVectorType(BaseTp), Mask,
-                          CostKind, Mask.size() / 2, BaseTp);
+                          VectorType::getDoubleElementsVectorType(SrcTy),
+                          VectorType::getDoubleElementsVectorType(SrcTy), Mask,
+                          CostKind, Mask.size() / 2, SrcTy);

   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
   if (Kind == TTI::SK_Transpose)
@@ -1568,11 +1578,11 @@ InstructionCost X86TTIImpl::getShuffleCost(
   // Attempt to detect a shuffle mask with a single defined element.
   bool IsInLaneShuffle = false;
   bool IsSingleElementMask = false;
-  if (BaseTp->getPrimitiveSizeInBits() > 0 &&
-      (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
-      BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
-      Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
-    unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
+  if (SrcTy->getPrimitiveSizeInBits() > 0 &&
+      (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
+      SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
+      Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
+    unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
     unsigned NumEltsPerLane = Mask.size() / NumLanes;
     if ((Mask.size() % NumLanes) == 0) {
       IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
@@ -1614,16 +1624,17 @@ InstructionCost X86TTIImpl::getShuffleCost(
       LT.second.getVectorElementType() ==
           SubLT.second.getVectorElementType() &&
       LT.second.getVectorElementType().getSizeInBits() ==
-          BaseTp->getElementType()->getPrimitiveSizeInBits()) {
+          SrcTy->getElementType()->getPrimitiveSizeInBits()) {
     assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
            "Unexpected number of elements!");
-    auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
+    auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
                                        LT.second.getVectorNumElements());
-    auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
+    auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
                                        SubLT.second.getVectorNumElements());
     int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
-    InstructionCost ExtractCost = getShuffleCost(
-        TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
+    InstructionCost ExtractCost =
+        getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
+                       ExtractIndex, SubTy);

     // If the original size is 32-bits or more, we can use pshufd. Otherwise
     // if we have SSSE3 we can use pshufb.
@@ -1646,7 +1657,8 @@ InstructionCost X86TTIImpl::getShuffleCost(
   // but if the destination vector legalizes to the same width as the subvector
   // then the insertion will simplify to a (free) register copy.
   if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
-    int NumElts = LT.second.getVectorNumElements();
+    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
+    int NumElts = DstLT.second.getVectorNumElements();
     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
     if (SubLT.second.isVector()) {
       int NumSubElts = SubLT.second.getVectorNumElements();
@@ -1670,7 +1682,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
   // Handle some common (illegal) sub-vector types as they are often very cheap
   // to shuffle even on targets without PSHUFB.
-  EVT VT = TLI->getValueType(DL, BaseTp);
+  EVT VT = TLI->getValueType(DL, SrcTy);
   if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
       !ST->hasSSSE3()) {
     static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
@@ -1717,17 +1729,17 @@ InstructionCost X86TTIImpl::getShuffleCost(
   MVT LegalVT = LT.second;
   if (LegalVT.isVector() &&
       LegalVT.getVectorElementType().getSizeInBits() ==
-          BaseTp->getElementType()->getPrimitiveSizeInBits() &&
+          SrcTy->getElementType()->getPrimitiveSizeInBits() &&
       LegalVT.getVectorNumElements() <
-          cast<FixedVectorType>(BaseTp)->getNumElements()) {
-    unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
+          cast<FixedVectorType>(SrcTy)->getNumElements()) {
+    unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
     unsigned LegalVTSize = LegalVT.getStoreSize();
     // Number of source vectors after legalization:
     unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
     // Number of destination vectors after legalization:
     InstructionCost NumOfDests = LT.first;

-    auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
+    auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
                                             LegalVT.getVectorNumElements());

     if (!Mask.empty() && NumOfDests.isValid()) {
@@ -1746,7 +1758,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
       // this operation is TTI::TCC_Free.
       NumOfDests =
           getTypeLegalizationCost(
-              FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
+              FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
               .first;
       unsigned E = NumOfDests.getValue();
       unsigned NormalizedVF =
@@ -1767,8 +1779,9 @@ InstructionCost X86TTIImpl::getShuffleCost(
             // one.
             if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
                 PrevRegMask != RegMask)
-              Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                     RegMask, CostKind, 0, nullptr);
+              Cost +=
+                  getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+                                 SingleOpTy, RegMask, CostKind, 0, nullptr);
             else
               // Just a copy of previous destination register.
               Cost += TTI::TCC_Basic;
@@ -1785,18 +1798,20 @@ InstructionCost X86TTIImpl::getShuffleCost(
           [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
                                               unsigned /*Unused*/,
                                               unsigned /*Unused*/,
                                               bool /*Unused*/) {
-            Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
-                                   CostKind, 0, nullptr);
+            Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
           });
       return Cost;
     }

     InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
     return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
-                                          {}, CostKind, 0, nullptr);
+                                          SingleOpTy, {}, CostKind, 0,
+                                          nullptr);
   }

-  return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }

 // If we're just moving a single element around (probably as an alternative to
@@ -2229,7 +2244,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
   if (ST->hasSSE3() && IsLoad)
     if (const auto *Entry =
             CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
-      assert(isLegalBroadcastLoad(BaseTp->getElementType(),
+      assert(isLegalBroadcastLoad(SrcTy->getElementType(),
                                   LT.second.getVectorElementCount()) &&
              "Table entry missing from isLegalBroadcastLoad()");
       return LT.first * Entry->Cost;
     }
@@ -2263,7 +2278,8 @@ InstructionCost X86TTIImpl::getShuffleCost(
     return LT.first * *KindCost;

-  return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }

 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -4903,8 +4919,8 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
       EVT VT = TLI->getValueType(DL, Val);
       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
-      ShuffleCost =
-          getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
+      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
+                                   CostKind, 0, SubTy);
     }
     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
@@ -4999,8 +5015,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
             // FIXME: we don't need to extract if all non-demanded elements
             //        are legalization-inserted padding.
             if (!LaneEltMask.isAllOnes())
-              Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
-                                     I * NumEltsPerLane, LaneTy);
+              Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                     CostKind, I * NumEltsPerLane, LaneTy);
             Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
                                                     /*Extract*/ false, CostKind);
           }
@@ -5017,8 +5033,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
             if (!AffectedLanes[I] ||
                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
               continue;
-            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
-                                   I * NumEltsPerLane, LaneTy);
+            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
+                                   CostKind, I * NumEltsPerLane, LaneTy);
           }
         }
       }
@@ -5077,7 +5093,7 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
                                     NumEltsPerLane, I * NumEltsPerLane);
         if (LaneEltMask.isZero())
           continue;
-        Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
+        Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
                                I * NumEltsPerLane, LaneTy);
         Cost += BaseT::getScalarizationOverhead(
             LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
@@ -5195,9 +5211,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
   unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();

-  InstructionCost SingleShuffleCost = getShuffleCost(
-      TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
-      /*Index=*/0, /*SubTp=*/nullptr);
+  InstructionCost SingleShuffleCost =
+      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
+                     /*Mask=*/{}, CostKind,
+                     /*Index=*/0, /*SubTp=*/nullptr);
   return NumDstVectorsDemanded * SingleShuffleCost;
 }
@@ -5338,9 +5355,10 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
       SubVecEltsLeft += CurrVecTy->getNumElements();
       // And that's free only for the 0'th subvector of a legalized vector.
       if (!Is0thSubVec)
-        Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
-                                      : TTI::ShuffleKind::SK_ExtractSubvector,
-                               VTy, {}, CostKind, NumEltDone(), CurrVecTy);
+        Cost +=
+            getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                                  : TTI::ShuffleKind::SK_ExtractSubvector,
+                           VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
     }

     // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
@@ -5416,17 +5434,17 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
     if (VT.isSimple() && Ty != VT.getSimpleVT() &&
         LT.second.getVectorNumElements() == NumElem)
       // Promotion requires extend/truncate for data and a shuffle for mask.
-      Cost +=
-          getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
-                         nullptr) +
-          getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
+      Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
+                             0, nullptr) +
+              getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
+                             0, nullptr);

     else if (LT.first * Ty.getVectorNumElements() > NumElem) {
       auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                              Ty.getVectorNumElements());
       // Expanding requires fill mask with zeroes
-      Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
-                             MaskTy);
+      Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
+                             CostKind, 0, MaskTy);
     }

     // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
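The reduction hunks that follow use a halving ladder: each level extracts the upper half (the narrower SubTy serves as both destination type and SubTp) and applies the operation at the reduced width. A rough, self-contained model of that cost recurrence, illustrative only and not the X86 implementation:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch only: e.g. halvingReductionCost(TTI, Instruction::Add, V8I32,
// TTI::TCK_RecipThroughput) mirrors a v8i32 add-reduction ladder.
InstructionCost halvingReductionCost(const TargetTransformInfo &TTI,
                                     unsigned Opcode, FixedVectorType *Ty,
                                     TargetTransformInfo::TargetCostKind CK) {
  InstructionCost Cost = 0;
  unsigned NumElts = Ty->getNumElements();
  while (NumElts > 1) {
    NumElts /= 2;
    auto *SubTy = FixedVectorType::get(Ty->getElementType(), NumElts);
    // Extract the high half, then combine it with the low half.
    Cost += TTI.getShuffleCost(TTI::SK_ExtractSubvector, SubTy, Ty, {}, CK,
                               /*Index=*/NumElts, SubTy);
    Cost += TTI.getArithmeticInstrCost(Opcode, SubTy, CK);
    Ty = SubTy;
  }
  return Cost;
}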
@@ -5690,7 +5708,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     // If we're reducing from 256/512 bits, use an extract_subvector.
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
-      ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
+      ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
                                       CostKind, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
@@ -5702,8 +5720,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       else
         ShufTy =
             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
-      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
-                                      CostKind, 0, nullptr);
+      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
+                                      {}, CostKind, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
       FixedVectorType *ShufTy;
@@ -5713,8 +5731,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       else
         ShufTy =
             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
-      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
-                                      CostKind, 0, nullptr);
+      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
+                                      {}, CostKind, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
       auto *ShiftTy = FixedVectorType::get(
@@ -5872,8 +5890,8 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
     // If we're reducing from 256/512 bits, use an extract_subvector.
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
-      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
-                                   NumVecElts, SubTy);
+      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                   CostKind, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
       // Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -5883,7 +5901,7 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
       else
         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
-      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
+      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
                                    CostKind, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
@@ -5892,7 +5910,7 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
       if (ValTy->isFloatingPointTy())
         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
       else
         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
-      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
+      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
                                    CostKind, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
@@ -6678,8 +6696,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
     TTI::ShuffleKind ShuffleKind =
         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

-    InstructionCost ShuffleCost =
-        getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
+    InstructionCost ShuffleCost = getShuffleCost(
+        ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);

     unsigned NumOfLoadsInInterleaveGrp =
         Indices.size() ? Indices.size() : Factor;
@@ -6735,8 +6753,9 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
   // There is no strided stores meanwhile. And store can't be folded in
   // shuffle.
   unsigned NumOfSources = Factor; // The number of values to be merged.
-  InstructionCost ShuffleCost = getShuffleCost(
-      TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
+  InstructionCost ShuffleCost =
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
+                     CostKind, 0, nullptr);
   unsigned NumOfShufflesPerStore = NumOfSources - 1;

   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 8045f1b1d6637..bc06c4746c3c4 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -149,9 +149,9 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
                          TTI::TargetCostKind CostKind) const override;

   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;

   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
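Any out-of-tree TargetTransformInfo implementation has to mirror this header change, or its override will silently stop overriding. A sketch of the updated declaration a downstream target would carry (the class name is hypothetical; the signature mirrors the in-tree overrides):

    class MyTTIImpl final : public BasicTTIImplBase<MyTTIImpl> {
    public:
      InstructionCost
      getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                     VectorType *SrcTy, ArrayRef<int> Mask,
                     TTI::TargetCostKind CostKind, int Index,
                     VectorType *SubTp, ArrayRef<const Value *> Args = {},
                     const Instruction *CxtI = nullptr) const override;
    };

Keeping the parameter order identical to the base declaration matters here: `override` catches signature drift, but not an accidental swap of `DstTy` and `SrcTy`, since both are `VectorType *`.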
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index ccb68700747b3..a712b4632e9a8 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1560,9 +1560,9 @@ class LowerMatrixIntrinsics {
       InstructionCost EmbedCost(0);
       // Roughly estimate the cost for embedding the columns into a vector.
       for (unsigned I = 1; I < N; ++I)
-        EmbedCost +=
-            TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
-                               {}, TTI::TCK_RecipThroughput);
+        EmbedCost += TTI.getShuffleCost(
+            TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
+            FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);
       return EmbedCost;
     }

@@ -1582,9 +1582,9 @@ class LowerMatrixIntrinsics {
       // vector.
       InstructionCost EmbedCost(0);
       for (unsigned I = 1; I < N; ++I)
-        EmbedCost -=
-            TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
-                               {}, TTI::TCK_RecipThroughput);
+        EmbedCost -= TTI.getShuffleCost(
+            TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
+            FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);
       return EmbedCost;
     }

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f28c2ce0acc98..f4259d3d69880 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5343,8 +5343,8 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
-    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
-                               CostKind, 0);
+    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+                               VectorTy, {}, CostKind, 0);
   return Cost;
 }

@@ -5361,8 +5361,8 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                                CostKind) +
-           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
-                              CostKind);
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
+                              VectorTy, {}, CostKind);
   }
   StoreInst *SI = cast<StoreInst>(I);
@@ -5428,8 +5428,8 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
     assert(!Legal->isMaskRequired(I) &&
            "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
-            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
-                               CostKind, 0);
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+                               VectorTy, {}, CostKind, 0);
   }
   return Cost;
 }
@@ -6171,6 +6171,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     SmallVector<int> Mask(VF.getKnownMinValue());
     std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
     return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+                              cast<VectorType>(VectorTy),
                               cast<VectorType>(VectorTy), Mask, CostKind,
                               VF.getKnownMinValue() - 1);
   }
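The broadcast case above is representative of how the loop vectorizer composes a uniform load's cost. A hedged sketch of that composition, assuming a load instruction and its widened vector type are at hand (the helper name is illustrative):

    // Illustrative only: address computation + scalar load + splat, with the
    // splat priced as a width-preserving broadcast shuffle.
    InstructionCost uniformLoadCost(const TargetTransformInfo &TTI,
                                    LoadInst *LI, VectorType *VectorTy,
                                    unsigned AS,
                                    TTI::TargetCostKind CostKind) {
      Type *ValTy = LI->getType();
      return TTI.getAddressComputationCost(ValTy) +
             TTI.getMemoryOpCost(Instruction::Load, ValTy, LI->getAlign(), AS,
                                 CostKind) +
             TTI.getShuffleCost(TTI::SK_Broadcast, VectorTy, VectorTy,
                                /*Mask=*/{}, CostKind);
    }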
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1141c1b2babbf..cb65c225dcdb6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5717,20 +5717,24 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
                int Index = 0, VectorType *SubTp = nullptr,
                ArrayRef<Value *> Args = {}) {
+  VectorType *DstTy = Tp;
+  if (!Mask.empty())
+    DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
+
   if (Kind != TTI::SK_PermuteTwoSrc)
-    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+    return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
+                              Args);
   int NumSrcElts = Tp->getElementCount().getKnownMinValue();
   int NumSubElts;
   if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                              Mask, NumSrcElts, NumSubElts, Index)) {
     if (Index + NumSubElts > NumSrcElts &&
         Index + NumSrcElts <= static_cast<int>(Mask.size()))
-      return TTI.getShuffleCost(
-          TTI::SK_InsertSubvector,
-          getWidenedType(Tp->getElementType(), Mask.size()), Mask,
-          TTI::TCK_RecipThroughput, Index, Tp);
+      return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
+                                TTI::TCK_RecipThroughput, Index, Tp);
   }
-  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+  return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
+                            Args);
 }

 /// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
@@ -12036,7 +12040,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       if (isa<FixedVectorType>(ScalarTy)) {
         assert(SLPReVec && "FixedVectorType is not expected.");
         return TTI.getShuffleCost(
-            TTI::SK_InsertSubvector, VecTy, {}, CostKind,
+            TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
            std::distance(VL.begin(), It) * getNumElements(ScalarTy),
            cast<FixedVectorType>(ScalarTy));
       }
@@ -22995,7 +22999,10 @@ class HorizontalReduction {
         unsigned ScalarTyNumElements = VecTy->getNumElements();
         for (unsigned I : seq<unsigned>(ReducedVals.size())) {
           VectorCost += TTI->getShuffleCost(
-              TTI::SK_PermuteSingleSrc, VectorTy,
+              TTI::SK_PermuteSingleSrc,
+              FixedVectorType::get(VecTy->getScalarType(),
+                                   ReducedVals.size()),
+              VectorTy,
               createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
           VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
                                                         CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 39def05b2eacd..e09f0a7dcd814 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -822,6 +822,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
     return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+                                  cast<VectorType>(VectorTy),
                                   cast<VectorType>(VectorTy), Mask,
                                   Ctx.CostKind, VF.getKnownMinValue() - 1);
   }
@@ -2869,9 +2870,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
   if (!Reverse)
     return Cost;

-  return Cost +=
-         Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
+  return Cost += Ctx.TTI.getShuffleCost(
+             TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
+             cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }

 void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -2985,9 +2986,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
   if (!Reverse)
     return Cost;

-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Cost + Ctx.TTI.getShuffleCost(
+                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
+                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3098,9 +3099,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
   if (!Reverse)
     return Cost;

-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Cost + Ctx.TTI.getShuffleCost(
+                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
+                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3478,7 +3479,8 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
   return Cost + IG->getNumMembers() *
                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                           VectorTy, {}, Ctx.CostKind, 0);
+                                           VectorTy, VectorTy, {}, Ctx.CostKind,
+                                           0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
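Note how the SLP helper above derives the destination type rather than threading it through every caller: a non-empty mask fixes the result width, otherwise the shuffle is width-preserving. The pattern in isolation (names are illustrative, not the in-tree locals):

    // Sketch of the destination-type derivation used by the SLP helper.
    VectorType *deriveShuffleDstTy(VectorType *SrcTy, ArrayRef<int> Mask) {
      if (Mask.empty())
        return SrcTy; // No mask: the result keeps the source's shape.
      // A mask of N elements always produces an N-element result.
      return FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
    }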
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..95e1f96c71b48 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -289,8 +289,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
     assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
     Mask[0] = OffsetEltIndex;
     if (OffsetEltIndex)
-      NewCost +=
-          TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
+      NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
+                                    CostKind);
   }

   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -510,12 +510,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                         PoisonMaskElem);
       ShuffleMask[BestInsIndex] = BestExtIndex;
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    VecTy, ShuffleMask, CostKind, 0, nullptr,
-                                    {ConvertToShuffle});
+                                    VecTy, VecTy, ShuffleMask, CostKind, 0,
+                                    nullptr, {ConvertToShuffle});
     } else {
-      NewCost +=
-          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                             {}, CostKind, 0, nullptr, {ConvertToShuffle});
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    VecTy, VecTy, {}, CostKind, 0, nullptr,
+                                    {ConvertToShuffle});
     }
   }

@@ -712,8 +712,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask,
-                         CostKind);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
+                         Mask, CostKind);

   bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
   // If the lengths of the two vectors are not equal,
@@ -723,7 +723,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
     SrcMask.assign(NumElts, PoisonMaskElem);
     SrcMask[Index] = Index;
     NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                  SrcVecTy, SrcMask, CostKind);
+                                  VecTy, SrcVecTy, SrcMask, CostKind);
   }

   if (NewCost > OldCost)
@@ -871,12 +871,12 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
                     : TargetTransformInfo::SK_PermuteTwoSrc;

   InstructionCost NewCost =
-      TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) +
+      TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
       (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
                                      TargetTransformInfo::CastContextHint::None,
                                      CostKind));
   InstructionCost OldCost =
-      TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) +
+      TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
       TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind);
@@ -943,7 +943,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   Mask.resize(FVTy->getNumElements(), 0);
   InstructionCost SplatCost =
       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask,
+      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
                          CostKind);

   // Calculate the cost of the VP Intrinsic
@@ -1260,14 +1260,13 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
   int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
   int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
-  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
   InstructionCost NewCost = TTI.getCmpSelInstrCost(
-      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred,
-      CostKind);
+      CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
   SmallVector<int> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
   ShufMask[CheapIndex] = ExpensiveIndex;
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
-                                ShufMask, CostKind);
+                                CmpTy, ShufMask, CostKind);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
   NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
   NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
@@ -1783,8 +1782,8 @@ bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
                                       TTI::CastContextHint::None, CostKind);

   InstructionCost NewCost = 0;
-  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy,
-                                ConcatMask, CostKind);
+  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
+                                MaskTy, ConcatMask, CostKind);
   NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
                                   TTI::CastContextHint::None, CostKind);
   if (Ty != ConcatIntTy)
@@ -1889,26 +1888,28 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   // Try to merge shuffles across the binop if the new shuffles are not costly.
   InstructionCost OldCost =
       TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
-                         OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
+                         BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);

   if (Match0)
-    OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
-                                  Mask0, CostKind, 0, nullptr, {Op00, Op01},
-                                  cast<Instruction>(BinOp->getOperand(0)));
+    OldCost += TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
+        0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
   if (Match1)
-    OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
-                                  Mask1, CostKind, 0, nullptr, {Op10, Op11},
-                                  cast<Instruction>(BinOp->getOperand(1)));
+    OldCost += TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
+        0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));

   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);

   if (!IsIdentity0)
-    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
-                                  NewMask0, CostKind, 0, nullptr, {Op00, Op01});
+    NewCost +=
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                           Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
   if (!IsIdentity1)
-    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
-                                  NewMask1, CostKind, 0, nullptr, {Op10, Op11});
+    NewCost +=
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                           Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});

   LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
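The folds in this file all follow the same compare-then-rewrite discipline, and the interesting part of this patch is that both sides of the comparison are now priced against the shuffle's true result type. A hedged sketch of the idiom, with standalone names rather than the in-tree locals:

    // Illustrative only: price the old and new masks with the real
    // destination type before committing to the rewrite.
    bool cheaperToRewrite(const TargetTransformInfo &TTI,
                          FixedVectorType *DstTy, FixedVectorType *SrcTy,
                          ArrayRef<int> OldMask, ArrayRef<int> NewMask,
                          TTI::TargetCostKind CostKind) {
      InstructionCost OldCost = TTI.getShuffleCost(
          TTI::SK_PermuteTwoSrc, DstTy, SrcTy, OldMask, CostKind);
      InstructionCost NewCost = TTI.getShuffleCost(
          TTI::SK_PermuteTwoSrc, DstTy, SrcTy, NewMask, CostKind);
      return NewCost <= OldCost;
    }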
@@ -2002,8 +2003,9 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   InstructionCost OldCost =
       TTI.getInstructionCost(LHS, CostKind) +
       TTI.getInstructionCost(RHS, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
-                         OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                         BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
+                         &I);

   // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
   // where one use shuffles have gotten split across the binop/cmp. These
@@ -2035,16 +2037,18 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
   ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);

+  auto *ShuffleCmpTy =
+      FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
   InstructionCost NewCost =
-      TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
-      TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+      TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
+                         nullptr, {X, Z}) +
+      TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
+                         nullptr, {Y, W});

   if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
     NewCost +=
         TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
   } else {
-    auto *ShuffleCmpTy =
-        FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
     NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
                                       ShuffleDstTy, PredLHS, CostKind);
   }
@@ -2112,15 +2116,17 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
                                     SelOp, SrcVecTy, C1VecTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
   OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
-  OldCost += TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr,
-                                {I.getOperand(0), I.getOperand(1)}, &I);
+  OldCost +=
+      TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
+                         {I.getOperand(0), I.getOperand(1)}, &I);

-  InstructionCost NewCost =
-      TTI.getShuffleCost(SK, C1VecTy, Mask, CostKind, 0, nullptr, {C1, C2});
-  NewCost +=
-      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {T1, T2});
-  NewCost +=
-      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {F1, F2});
+  InstructionCost NewCost = TTI.getShuffleCost(
+      SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
+      Mask, CostKind, 0, nullptr, {C1, C2});
+  NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
+                                nullptr, {T1, T2});
+  NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
+                                nullptr, {F1, F2});
   auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
       toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
   NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
@@ -2220,11 +2226,12 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
                                     TTI::CastContextHint::None, CostKind);
   InstructionCost OldCost = CostC0 + CostC1;
   OldCost +=
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy,
-                         OldMask, CostKind, 0, nullptr, {}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                         CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);

-  InstructionCost NewCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind);
+  InstructionCost NewCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
+                         CastSrcTy, NewMask, CostKind);
   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
                                   TTI::CastContextHint::None, CostKind);
   if (!C0->hasOneUse())
@@ -2363,8 +2370,9 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   TargetTransformInfo::ShuffleKind SK =
       IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
              : TargetTransformInfo::SK_PermuteTwoSrc;
-  InstructionCost NewCost = TTI.getShuffleCost(
-      SK, ShuffleSrcTy, NewMask, CostKind, 0, nullptr, {NewX, NewY});
+  InstructionCost NewCost =
+      TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
+                         nullptr, {NewX, NewY});

   if (!OuterV0->hasOneUse())
     NewCost += InnerCost0;
   if (!OuterV1->hasOneUse())
@@ -2415,21 +2423,23 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   InstructionCost OldCost =
       TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
       TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
-                         CostKind, 0, nullptr, {II0, II1}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                         II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);

   SmallVector<Type *> NewArgsTy;
   InstructionCost NewCost = 0;
-  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
       NewArgsTy.push_back(II0->getArgOperand(I)->getType());
     } else {
       auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
-      NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
-                                               ShuffleDstTy->getNumElements()));
+      auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
+                                         ShuffleDstTy->getNumElements());
+      NewArgsTy.push_back(ArgTy);
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
-                                    VecTy, OldMask, CostKind);
+                                    ArgTy, VecTy, OldMask, CostKind);
     }
+  }
   IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
   NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
@@ -2508,7 +2518,9 @@ static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
   // during legalization.
   SmallVector<int> ConcatMask(NumElts * 2);
   std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
-  if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0)
+  if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
+                         FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
+                         Ty, ConcatMask, CostKind) != 0)
     return false;

   unsigned NumSlices = Item.size() / NumElts;
@@ -2877,21 +2889,15 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
       SmallVector<int> ConcatMask;
       Shuffle->getShuffleMask(ConcatMask);
       sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
-      // In the case of a truncating shuffle it's possible for the mask
-      // to have an index greater than the size of the resulting vector.
-      // This requires special handling.
-      bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
       bool UsesSecondVec =
          any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
-      FixedVectorType *VecTyForCost =
-          (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
       InstructionCost OldCost = TTI.getShuffleCost(
-          UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
-          VecTyForCost, Shuffle->getShuffleMask(), CostKind);
+          UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+          ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
       InstructionCost NewCost = TTI.getShuffleCost(
-          UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
-          VecTyForCost, ConcatMask, CostKind);
+          UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+          ShuffleInputType, ConcatMask, CostKind);

       LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: "
                         << *Shuffle << "\n");
@@ -3205,10 +3211,11 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
     return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
                                       ? TTI::SK_PermuteSingleSrc
                                       : TTI::SK_PermuteTwoSrc,
-                                  VT, SV->getShuffleMask(), CostKind);
+                                  VT, VT, SV->getShuffleMask(), CostKind);
   };
   auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
-    return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask, CostKind);
+    return C +
+           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
   };

   // Get the costs of the shuffles + binops before and after with the new
@@ -3446,8 +3453,8 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
     // Ignore 'free' identity insertion shuffle.
     // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
     if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
-      NewCost += TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind, 0, nullptr,
-                                    {DstVec, SrcVec});
+      NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
+                                    nullptr, {DstVec, SrcVec});
   } else {
     // When creating length-changing-vector, always create with a Mask whose
     // first element has an ExtIdx, so that the first element of the vector
@@ -3459,8 +3466,8 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
     ExtToVecMask[0] = ExtIdx;
     // Add cost for expanding or narrowing
     NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                 SrcVecTy, ExtToVecMask, CostKind);
-    NewCost += TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind);
+                                 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
+    NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
   }

   if (!Ext->hasOneUse())
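The diff is truncated here, but the isFreeConcat change above makes a good closing example of the new contract: a two-source concatenation doubles the width, so the destination type must be rebuilt from the mask length rather than reused from the source. A self-contained sketch, assuming only a TargetTransformInfo reference and a fixed-width source type:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include <numeric>
    using namespace llvm;

    // Illustrative only: price concat(x, y) as a two-source shuffle whose
    // identity mask is twice as wide as either input.
    InstructionCost concatCost(const TargetTransformInfo &TTI,
                               FixedVectorType *Ty,
                               TTI::TargetCostKind CostKind) {
      unsigned NumElts = Ty->getNumElements();
      SmallVector<int> ConcatMask(NumElts * 2);
      std::iota(ConcatMask.begin(), ConcatMask.end(), 0); // 0, 1, ..., 2N-1
      auto *DstTy = FixedVectorType::get(Ty->getScalarType(), NumElts * 2);
      return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, DstTy, Ty, ConcatMask,
                                CostKind);
    }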