|
123 | 123 | #include "llvm/IR/User.h"
|
124 | 124 | #include "llvm/IR/Value.h"
|
125 | 125 | #include "llvm/IR/ValueHandle.h"
|
| 126 | +#include "llvm/IR/VectorBuilder.h" |
126 | 127 | #include "llvm/IR/Verifier.h"
|
127 | 128 | #include "llvm/Support/Casting.h"
|
128 | 129 | #include "llvm/Support/CommandLine.h"
|
@@ -247,10 +248,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
|
247 | 248 | clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
|
248 | 249 | "Create lane mask using active.lane.mask intrinsic, and use "
|
249 | 250 | "it for both data and control flow"),
|
250 | | - clEnumValN(
251 | | - TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252 | | - "data-and-control-without-rt-check",
253 | | - "Similar to data-and-control, but remove the runtime check")));
| 251 | + clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
| 252 | + "data-and-control-without-rt-check", |
| 253 | + "Similar to data-and-control, but remove the runtime check"), |
| 254 | + clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", |
| 255 | + "Use predicated EVL instructions for tail folding if the " |
| 256 | + "target supports vector length predication"))); |
254 | 257 |
|
255 | 258 | static cl::opt<bool> MaximizeBandwidth(
|
256 | 259 | "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
|
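The data-with-evl value added above extends the ForceTailFoldingStyle option; assuming its flag string is force-tail-folding-style (declared just above that hunk, not shown here), the new mode would be requested with something like -force-tail-folding-style=data-with-evl.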
@@ -1098,9 +1101,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
|
1098 | 1101 | // handled.
|
1099 | 1102 | if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
|
1100 | 1103 | isa<VPInterleaveRecipe>(CurRec) ||
|
1101 | | - isa<VPScalarIVStepsRecipe>(CurRec) ||
1102 | | - isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103 | | - isa<VPActiveLaneMaskPHIRecipe>(CurRec))
| 1104 | + isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec)) |
1104 | 1105 | continue;
|
1105 | 1106 |
|
1106 | 1107 | // This recipe contributes to the address computation of a widen
|
@@ -1640,6 +1641,23 @@ class LoopVectorizationCostModel {
|
1640 | 1641 | return foldTailByMasking() || Legal->blockNeedsPredication(BB);
|
1641 | 1642 | }
|
1642 | 1643 |
|
| 1644 | + /// Returns true if VP intrinsics with explicit vector length support should |
| 1645 | + /// be generated in the tail-folded loop. |
| 1646 | + bool useVPIWithVPEVLVectorization() const { |
| 1647 | + return PreferEVL && !EnableVPlanNativePath && |
| 1648 | + getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && |
| 1649 | + // FIXME: implement support for max safe dependency distance. |
| 1650 | + Legal->isSafeForAnyVectorWidth() && |
| 1651 | + // FIXME: remove this once reductions are supported. |
| 1652 | + Legal->getReductionVars().empty() && |
| 1653 | + // FIXME: remove this once vp_reverse is supported. |
| 1654 | + none_of( |
| 1655 | + WideningDecisions, |
| 1656 | + [](const std::pair<std::pair<Instruction *, ElementCount>, |
| 1657 | + std::pair<InstWidening, InstructionCost>> |
| 1658 | + &Data) { return Data.second.first == CM_Widen_Reverse; }); |
| 1659 | + } |
| 1660 | + |
1643 | 1661 | /// Returns true if the Phi is part of an inloop reduction.
|
1644 | 1662 | bool isInLoopReduction(PHINode *Phi) const {
|
1645 | 1663 | return InLoopReductions.contains(Phi);
|
@@ -1785,6 +1803,10 @@ class LoopVectorizationCostModel {
|
1785 | 1803 | /// All blocks of loop are to be masked to fold tail of scalar iterations.
|
1786 | 1804 | bool CanFoldTailByMasking = false;
|
1787 | 1805 |
|
| 1806 | + /// Control whether to generate VP intrinsics with explicit-vector-length |
| 1807 | + /// support in vectorized code. |
| 1808 | + bool PreferEVL = false; |
| 1809 | + |
1788 | 1810 | /// A map holding scalar costs for different vectorization factors. The
|
1789 | 1811 | /// presence of a cost for an instruction in the mapping indicates that the
|
1790 | 1812 | /// instruction will be scalarized when vectorizing with the associated
|
@@ -4691,6 +4713,39 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
4691 | 4713 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
|
4692 | 4714 | if (Legal->prepareToFoldTailByMasking()) {
|
4693 | 4715 | CanFoldTailByMasking = true;
|
| 4716 | + if (getTailFoldingStyle() == TailFoldingStyle::None) |
| 4717 | + return MaxFactors; |
| 4718 | + |
| 4719 | + if (UserIC > 1) { |
| 4720 | + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
| 4721 | + "not generate VP intrinsics since interleave count " |
| 4722 | + "specified is greater than 1.\n"); |
| 4723 | + return MaxFactors; |
| 4724 | + } |
| 4725 | + |
| 4726 | + if (MaxFactors.ScalableVF.isVector()) { |
| 4727 | + assert(MaxFactors.ScalableVF.isScalable() && |
| 4728 | + "Expected scalable vector factor."); |
| 4729 | + // FIXME: use actual opcode/data type for analysis here. |
| 4730 | + PreferEVL = getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && |
| 4731 | + TTI.hasActiveVectorLength(0, nullptr, Align()); |
| 4732 | +#ifndef NDEBUG |
| 4733 | + if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { |
| 4734 | + if (PreferEVL) |
| 4735 | + dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
| 4736 | + "try to generate VP Intrinsics.\n"; |
| 4737 | + else |
| 4738 | + dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
| 4739 | + "not try to generate VP Intrinsics since the target " |
| 4740 | + "does not support vector length predication.\n"; |
| 4741 | + } |
| 4742 | +#endif // !NDEBUG |
| 4743 | + |
| 4744 | + // Tail folding with VP intrinsics restricts the VF to be scalable. |
| 4745 | + if (PreferEVL) |
| 4746 | + MaxFactors.FixedVF = ElementCount::getFixed(1); |
| 4747 | + } |
| 4748 | + |
4694 | 4749 | return MaxFactors;
|
4695 | 4750 | }
|
4696 | 4751 |
|
@@ -5300,6 +5355,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
|
5300 | 5355 | if (!isScalarEpilogueAllowed())
|
5301 | 5356 | return 1;
|
5302 | 5357 |
|
| 5358 | + // Do not interleave if EVL is preferred and no user IC is specified. |
| 5359 | + if (useVPIWithVPEVLVectorization()) |
| 5360 | + return 1; |
| 5361 | + |
5303 | 5362 | // We used the distance for the interleave count.
|
5304 | 5363 | if (!Legal->isSafeForAnyVectorWidth())
|
5305 | 5364 | return 1;
|
@@ -8537,6 +8596,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
|
8537 | 8596 | VPlanTransforms::truncateToMinimalBitwidths(
|
8538 | 8597 | *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
|
8539 | 8598 | VPlanTransforms::optimize(*Plan, *PSE.getSE());
|
| 8599 | + if (CM.useVPIWithVPEVLVectorization()) |
| 8600 | + VPlanTransforms::addExplicitVectorLength(*Plan); |
8540 | 8601 | assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
|
8541 | 8602 | VPlans.push_back(std::move(Plan));
|
8542 | 8603 | }
|
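For context, addExplicitVectorLength (invoked above) is meant to rewrite the canonical IV loop so that each vector iteration processes EVL lanes. A minimal sketch of the intended loop shape, assuming an i32 loop on a target with vector-length predication; block and value names are illustrative, not taken from this patch:

    vector.body:
      %iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
      %remaining = sub i64 %trip.count, %iv
      ; i32 elements, VF = vscale x 4: ask how many lanes this iteration covers
      %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %remaining, i32 4, i1 true)
      ; ... vp.* loads/stores and other ops predicated by %evl ...
      %evl.ext = zext i32 %evl to i64
      %iv.next = add nuw i64 %iv, %evl.ext
      %done = icmp eq i64 %iv.next, %trip.count
      br i1 %done, label %middle.block, label %vector.body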
@@ -9399,6 +9460,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
|
9399 | 9460 | State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
|
9400 | 9461 | }
|
9401 | 9462 |
|
| 9463 | +/// Creates either a vp_store or a vp_scatter intrinsic call to represent |
| 9464 | +/// a predicated store or scatter. |
| 9465 | +static Instruction * |
| 9466 | +lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, |
| 9467 | + Value *StoredVal, bool IsScatter, Value *Mask, |
| 9468 | + Value *EVLPart, const Align &Alignment) { |
| 9469 | + CallInst *Call; |
| 9470 | + if (IsScatter) { |
| 9471 | + Call = Builder.CreateIntrinsic(Type::getVoidTy(EVLPart->getContext()), |
| 9472 | + Intrinsic::vp_scatter, |
| 9473 | + {StoredVal, Addr, Mask, EVLPart}); |
| 9474 | + } else { |
| 9475 | + VectorBuilder VBuilder(Builder); |
| 9476 | + VBuilder.setEVL(EVLPart).setMask(Mask); |
| 9477 | + Call = cast<CallInst>(VBuilder.createVectorInstruction( |
| 9478 | + Instruction::Store, Type::getVoidTy(EVLPart->getContext()), |
| 9479 | + {StoredVal, Addr})); |
| 9480 | + } |
| 9481 | + Call->addParamAttr( |
| 9482 | + 1, Attribute::getWithAlignment(Call->getContext(), Alignment)); |
| 9483 | + return Call; |
| 9484 | +} |
| 9485 | + |
| 9486 | +/// Creates either a vp_load or a vp_gather intrinsic call to represent |
| 9487 | +/// a predicated load or gather. |
| 9488 | +static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, |
| 9489 | + VectorType *DataTy, |
| 9490 | + Value *Addr, bool IsGather, |
| 9491 | + Value *Mask, Value *EVLPart, |
| 9492 | + const Align &Alignment) { |
| 9493 | + CallInst *Call; |
| 9494 | + if (IsGather) { |
| 9495 | + Call = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, |
| 9496 | + {Addr, Mask, EVLPart}, nullptr, |
| 9497 | + "wide.masked.gather"); |
| 9498 | + } else { |
| 9499 | + VectorBuilder VBuilder(Builder); |
| 9500 | + VBuilder.setEVL(EVLPart).setMask(Mask); |
| 9501 | + Call = cast<CallInst>(VBuilder.createVectorInstruction( |
| 9502 | + Instruction::Load, DataTy, Addr, "vp.op.load")); |
| 9503 | + } |
| 9504 | + Call->addParamAttr( |
| 9505 | + 0, Attribute::getWithAlignment(Call->getContext(), Alignment)); |
| 9506 | + return Call; |
| 9507 | +} |
| 9508 | + |
9402 | 9509 | void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
9403 | 9510 | VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
|
9404 | 9511 |
|
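For reference, the LangRef signatures of the four intrinsics these helpers can emit, shown for a <vscale x 2 x i64> payload with opaque pointers; the vector mask enables lanes and the trailing i32 operand is the explicit vector length:

    declare <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr, <vscale x 2 x i1>, i32)
    declare void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
    declare <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr>, <vscale x 2 x i1>, i32)
    declare void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64>, <vscale x 2 x ptr>, <vscale x 2 x i1>, i32)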
@@ -9430,14 +9537,31 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
9430 | 9537 | }
|
9431 | 9538 | }
|
9432 | 9539 |
|
| 9540 | + auto MaskValue = [&](unsigned Part) -> Value * { |
| 9541 | + if (isMaskRequired) |
| 9542 | + return BlockInMaskParts[Part]; |
| 9543 | + return nullptr; |
| 9544 | + }; |
| 9545 | + |
9433 | 9546 | // Handle Stores:
|
9434 | 9547 | if (SI) {
|
9435 | 9548 | State.setDebugLocFrom(SI->getDebugLoc());
|
9436 | 9549 |
|
9437 | 9550 | for (unsigned Part = 0; Part < State.UF; ++Part) {
|
9438 | 9551 | Instruction *NewSI = nullptr;
|
9439 | 9552 | Value *StoredVal = State.get(StoredValue, Part);
|
9440 | | - if (CreateGatherScatter) {
| 9553 | + if (State.EVL) { |
| 9554 | + Value *EVLPart = State.get(State.EVL, Part); |
| 9555 | + // If EVL is not nullptr, it holds a valid value set during plan |
| 9556 | + // creation, possibly defaulting to the whole vector register length. |
| 9557 | + // EVL is created only if TTI prefers predicated vectorization, so a |
| 9558 | + // non-null EVL also implies a preference for predicated |
| 9559 | + // vectorization. |
| 9560 | + // FIXME: Support reverse store after vp_reverse is added. |
| 9561 | + NewSI = lowerStoreUsingVectorIntrinsics( |
| 9562 | + Builder, State.get(getAddr(), Part), StoredVal, CreateGatherScatter, |
| 9563 | + MaskValue(Part), EVLPart, Alignment); |
| 9564 | + } else if (CreateGatherScatter) { |
9441 | 9565 | Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
|
9442 | 9566 | Value *VectorGep = State.get(getAddr(), Part);
|
9443 | 9567 | NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
|
@@ -9467,7 +9591,18 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
9467 | 9591 | State.setDebugLocFrom(LI->getDebugLoc());
|
9468 | 9592 | for (unsigned Part = 0; Part < State.UF; ++Part) {
|
9469 | 9593 | Value *NewLI;
|
9470 | | - if (CreateGatherScatter) {
| 9594 | + if (State.EVL) { |
| 9595 | + Value *EVLPart = State.get(State.EVL, Part); |
| 9596 | + // If EVL is not nullptr, it holds a valid value set during plan |
| 9597 | + // creation, possibly defaulting to the whole vector register length. |
| 9598 | + // EVL is created only if TTI prefers predicated vectorization, so a |
| 9599 | + // non-null EVL also implies a preference for predicated |
| 9600 | + // vectorization. |
| 9601 | + // FIXME: Support reverse loading after vp_reverse is added. |
| 9602 | + NewLI = lowerLoadUsingVectorIntrinsics( |
| 9603 | + Builder, DataTy, State.get(getAddr(), Part), CreateGatherScatter, |
| 9604 | + MaskValue(Part), EVLPart, Alignment); |
| 9605 | + } else if (CreateGatherScatter) { |
9471 | 9606 | Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
|
9472 | 9607 | Value *VectorGep = State.get(getAddr(), Part);
|
9473 | 9608 | NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
|
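Putting the pieces together, the EVL store path above would emit IR along these lines (a sketch with hypothetical values; note the alignment attached as a parameter attribute on the pointer operand, mirroring the addParamAttr calls in the helpers):

    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %stored.val, ptr align 4 %addr,
                                        <vscale x 4 x i1> %mask, i32 %evl)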