-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[VPlan] Unroll VPReplicateRecipe by VF. #142433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c354bdc
2e877b5
0800450
884b9d3
ed982d4
14e296c
48699d9
cc1a779
47b9665
af2d2c0
5a73ebe
ab6665c
b6a0834
ae3e3c4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -261,6 +261,13 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { | |
return Data.VPV2Scalars[Def][0]; | ||
} | ||
|
||
// Look through BuildVector to avoid redundant extracts. | ||
// TODO: Remove once replicate regions are unrolled explicitly. | ||
if (Lane.getKind() == VPLane::Kind::First && match(Def, m_BuildVector())) { | ||
auto *BuildVector = cast<VPInstruction>(Def); | ||
return get(BuildVector->getOperand(Lane.getKnownLane()), true); | ||
} | ||
|
||
assert(hasVectorValue(Def)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Independent: missing error message. |
||
auto *VecPart = Data.VPV2Vector[Def]; | ||
if (!VecPart->getType()->isVectorTy()) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -936,6 +936,13 @@ class VPInstruction : public VPRecipeWithIRFlags, | |
BranchOnCount, | ||
BranchOnCond, | ||
Broadcast, | ||
/// Given operands of (the same) struct type, creates a struct of fixed- | ||
/// width vectors each containing a struct field of all operands. The | ||
/// number of operands matches the element count of every vector. | ||
BuildStructVector, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lex order has BuildVector after BuildStructVector? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reordered, thanks |
||
/// Creates a fixed-width vector containing all operands. The number of | ||
/// operands matches the vector element count. | ||
BuildVector, | ||
ComputeAnyOfResult, | ||
ComputeFindLastIVResult, | ||
ComputeReductionResult, | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -221,6 +221,13 @@ struct Recipe_match { | |||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
bool match(const VPRecipeBase *R) const { | ||||||||||||||||||
if (std::tuple_size<Ops_t>::value == 0) { | ||||||||||||||||||
assert(Opcode == VPInstruction::BuildVector && | ||||||||||||||||||
"can only match BuildVector with empty ops"); | ||||||||||||||||||
auto *VPI = dyn_cast<VPInstruction>(R); | ||||||||||||||||||
return VPI && VPI->getOpcode() == VPInstruction::BuildVector; | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...)) | ||||||||||||||||||
return false; | ||||||||||||||||||
|
||||||||||||||||||
|
@@ -263,6 +270,10 @@ struct Recipe_match { | |||||||||||||||||
} | ||||||||||||||||||
}; | ||||||||||||||||||
|
||||||||||||||||||
template <unsigned Opcode, typename... RecipeTys> | ||||||||||||||||||
using ZeroOpRecipe_match = | ||||||||||||||||||
Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>; | ||||||||||||||||||
|
||||||||||||||||||
template <typename Op0_t, unsigned Opcode, typename... RecipeTys> | ||||||||||||||||||
using UnaryRecipe_match = | ||||||||||||||||||
Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>; | ||||||||||||||||||
|
@@ -271,6 +282,9 @@ template <typename Op0_t, unsigned Opcode> | |||||||||||||||||
using UnaryVPInstruction_match = | ||||||||||||||||||
UnaryRecipe_match<Op0_t, Opcode, VPInstruction>; | ||||||||||||||||||
|
||||||||||||||||||
template <unsigned Opcode> | ||||||||||||||||||
using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>; | ||||||||||||||||||
|
||||||||||||||||||
template <typename Op0_t, unsigned Opcode> | ||||||||||||||||||
using AllUnaryRecipe_match = | ||||||||||||||||||
UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe, | ||||||||||||||||||
|
@@ -302,6 +316,12 @@ using AllBinaryRecipe_match = | |||||||||||||||||
BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe, | ||||||||||||||||||
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>; | ||||||||||||||||||
|
||||||||||||||||||
/// BuildVector is matches only its opcode, w/o matching its operands as the | ||||||||||||||||||
/// number of operands is not fixed. | ||||||||||||||||||
inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { | ||||||||||||||||||
return ZeroOpVPInstruction_match<VPInstruction::BuildVector>(); | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
Comment on lines
+321
to
+324
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
plus some explanation why - number of operands varies? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done thanks |
||||||||||||||||||
template <unsigned Opcode, typename Op0_t> | ||||||||||||||||||
inline UnaryVPInstruction_match<Op0_t, Opcode> | ||||||||||||||||||
m_VPInstruction(const Op0_t &Op0) { | ||||||||||||||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -551,6 +551,11 @@ Value *VPInstruction::generate(VPTransformState &State) { | |||||||
} | ||||||||
case Instruction::ExtractElement: { | ||||||||
assert(State.VF.isVector() && "Only extract elements from vectors"); | ||||||||
if (getOperand(1)->isLiveIn()) { | ||||||||
unsigned IdxToExtract = | ||||||||
cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue(); | ||||||||
return State.get(getOperand(0), VPLane(IdxToExtract)); | ||||||||
} | ||||||||
Value *Vec = State.get(getOperand(0)); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are lines 499-501 not deleted, given that we've already returned on line 496? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep removed, thanks |
||||||||
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); | ||||||||
return Builder.CreateExtractElement(Vec, Idx, Name); | ||||||||
Comment on lines
559
to
561
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
unreachable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed thanks |
||||||||
|
@@ -664,6 +669,34 @@ Value *VPInstruction::generate(VPTransformState &State) { | |||||||
return Builder.CreateVectorSplat( | ||||||||
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); | ||||||||
} | ||||||||
case VPInstruction::BuildStructVector: { | ||||||||
// For struct types, we need to build a new 'wide' struct type, where each | ||||||||
// element is widened, i.e., we create a struct of vectors. | ||||||||
auto *StructTy = | ||||||||
cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0))); | ||||||||
Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); | ||||||||
for (const auto &[LaneIndex, Op] : enumerate(operands())) { | ||||||||
for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); | ||||||||
FieldIndex++) { | ||||||||
Value *ScalarValue = | ||||||||
Builder.CreateExtractValue(State.get(Op, true), FieldIndex); | ||||||||
Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); | ||||||||
VectorValue = | ||||||||
Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); | ||||||||
Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); | ||||||||
} | ||||||||
} | ||||||||
return Res; | ||||||||
} | ||||||||
case VPInstruction::BuildVector: { | ||||||||
auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); | ||||||||
auto NumOfElements = ElementCount::getFixed(getNumOperands()); | ||||||||
Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); | ||||||||
for (const auto &[Idx, Op] : enumerate(operands())) | ||||||||
Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), | ||||||||
State.Builder.getInt32(Idx)); | ||||||||
return Res; | ||||||||
} | ||||||||
case VPInstruction::ReductionStartVector: { | ||||||||
if (State.VF.isScalar()) | ||||||||
return State.get(getOperand(0), true); | ||||||||
|
@@ -953,10 +986,11 @@ void VPInstruction::execute(VPTransformState &State) { | |||||||
if (!hasResult()) | ||||||||
return; | ||||||||
assert(GeneratedValue && "generate must produce a value"); | ||||||||
assert( | ||||||||
(GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly || | ||||||||
State.VF.isScalar()) && | ||||||||
"scalar value but not only first lane defined"); | ||||||||
assert((((GeneratedValue->getType()->isVectorTy() || | ||||||||
GeneratedValue->getType()->isStructTy()) == | ||||||||
!GeneratesPerFirstLaneOnly) || | ||||||||
State.VF.isScalar()) && | ||||||||
"scalar value but not only first lane defined"); | ||||||||
State.set(this, GeneratedValue, | ||||||||
/*IsScalar*/ GeneratesPerFirstLaneOnly); | ||||||||
} | ||||||||
|
@@ -970,6 +1004,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { | |||||||
case Instruction::ICmp: | ||||||||
case Instruction::Select: | ||||||||
case VPInstruction::AnyOf: | ||||||||
case VPInstruction::BuildStructVector: | ||||||||
case VPInstruction::BuildVector: | ||||||||
case VPInstruction::CalculateTripCountMinusVF: | ||||||||
case VPInstruction::CanonicalIVIncrementForPart: | ||||||||
case VPInstruction::ExtractLastElement: | ||||||||
|
@@ -1092,6 +1128,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, | |||||||
case VPInstruction::Broadcast: | ||||||||
O << "broadcast"; | ||||||||
break; | ||||||||
case VPInstruction::BuildStructVector: | ||||||||
O << "buildstructvector"; | ||||||||
break; | ||||||||
case VPInstruction::BuildVector: | ||||||||
O << "buildvector"; | ||||||||
break; | ||||||||
case VPInstruction::ExtractLastElement: | ||||||||
O << "extract-last-element"; | ||||||||
break; | ||||||||
|
@@ -2686,45 +2728,27 @@ static void scalarizeInstruction(const Instruction *Instr, | |||||||
|
||||||||
void VPReplicateRecipe::execute(VPTransformState &State) { | ||||||||
Instruction *UI = getUnderlyingInstr(); | ||||||||
if (State.Lane) { // Generate a single instance. | ||||||||
assert((State.VF.isScalar() || !isSingleScalar()) && | ||||||||
"uniform recipe shouldn't be predicated"); | ||||||||
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); | ||||||||
scalarizeInstruction(UI, this, *State.Lane, State); | ||||||||
// Insert scalar instance packing it into a vector. | ||||||||
if (State.VF.isVector() && shouldPack()) { | ||||||||
Value *WideValue; | ||||||||
// If we're constructing lane 0, initialize to start from poison. | ||||||||
if (State.Lane->isFirstLane()) { | ||||||||
assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); | ||||||||
WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); | ||||||||
} else { | ||||||||
WideValue = State.get(this); | ||||||||
} | ||||||||
State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, | ||||||||
*State.Lane)); | ||||||||
} | ||||||||
return; | ||||||||
} | ||||||||
|
||||||||
if (IsSingleScalar) { | ||||||||
// Uniform within VL means we need to generate lane 0. | ||||||||
if (!State.Lane) { | ||||||||
assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " | ||||||||
"must have already been unrolled"); | ||||||||
scalarizeInstruction(UI, this, VPLane(0), State); | ||||||||
return; | ||||||||
} | ||||||||
Comment on lines
2735
to
2737
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Handle simpler single-scalar case first and early-return, assert one of above two cases applies:
? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done thanks |
||||||||
|
||||||||
// A store of a loop varying value to a uniform address only needs the last | ||||||||
// copy of the store. | ||||||||
if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) { | ||||||||
auto Lane = VPLane::getLastLaneForVF(State.VF); | ||||||||
scalarizeInstruction(UI, this, VPLane(Lane), State); | ||||||||
return; | ||||||||
assert((State.VF.isScalar() || !isSingleScalar()) && | ||||||||
"uniform recipe shouldn't be predicated"); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
retain the assert here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done thanks |
||||||||
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); | ||||||||
scalarizeInstruction(UI, this, *State.Lane, State); | ||||||||
// Insert scalar instance packing it into a vector. | ||||||||
if (State.VF.isVector() && shouldPack()) { | ||||||||
Value *WideValue = | ||||||||
State.Lane->isFirstLane() | ||||||||
? PoisonValue::get(VectorType::get(UI->getType(), State.VF)) | ||||||||
: State.get(this); | ||||||||
State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, | ||||||||
*State.Lane)); | ||||||||
} | ||||||||
|
||||||||
// Generate scalar instances for all VF lanes. | ||||||||
const unsigned EndLane = State.VF.getFixedValue(); | ||||||||
for (unsigned Lane = 0; Lane < EndLane; ++Lane) | ||||||||
scalarizeInstruction(UI, this, VPLane(Lane), State); | ||||||||
} | ||||||||
|
||||||||
bool VPReplicateRecipe::shouldPack() const { | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
#include "VPlan.h" | ||
#include "VPlanAnalysis.h" | ||
#include "VPlanCFG.h" | ||
#include "VPlanHelpers.h" | ||
#include "VPlanPatternMatch.h" | ||
#include "VPlanTransforms.h" | ||
#include "VPlanUtils.h" | ||
|
@@ -450,3 +451,87 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { | |
|
||
VPlanTransforms::removeDeadRecipes(Plan); | ||
} | ||
|
||
/// Create a single-scalar clone of \p RepR for lane \p Lane. | ||
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, | ||
Type *IdxTy, VPReplicateRecipe *RepR, | ||
VPLane Lane) { | ||
// Collect the operands at Lane, creating extracts as needed. | ||
SmallVector<VPValue *> NewOps; | ||
for (VPValue *Op : RepR->operands()) { | ||
if (vputils::isSingleScalar(Op)) { | ||
NewOps.push_back(Op); | ||
continue; | ||
} | ||
if (Lane.getKind() == VPLane::Kind::ScalableLast) { | ||
NewOps.push_back( | ||
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); | ||
continue; | ||
} | ||
// Look through buildvector to avoid unnecessary extracts. | ||
if (match(Op, m_BuildVector())) { | ||
NewOps.push_back( | ||
cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); | ||
continue; | ||
} | ||
VPValue *Idx = | ||
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); | ||
VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); | ||
NewOps.push_back(Ext); | ||
} | ||
|
||
auto *New = | ||
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this be a VPInstruction? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); | ||
New->insertBefore(RepR); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That this failed to call transferFlags here, which resulted in miscompiles. I've created #147398 to fix. |
||
return New; | ||
} | ||
|
||
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { | ||
Type *IdxTy = IntegerType::get( | ||
Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32); | ||
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( | ||
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { | ||
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { | ||
auto *RepR = dyn_cast<VPReplicateRecipe>(&R); | ||
if (!RepR || RepR->isSingleScalar()) | ||
continue; | ||
|
||
VPBuilder Builder(RepR); | ||
if (RepR->getNumUsers() == 0) { | ||
if (isa<StoreInst>(RepR->getUnderlyingInstr()) && | ||
vputils::isSingleScalar(RepR->getOperand(1))) { | ||
// Stores to invariant addresses need to store the last lane only. | ||
cloneForLane(Plan, Builder, IdxTy, RepR, | ||
VPLane::getLastLaneForVF(VF)); | ||
} else { | ||
// Create single-scalar version of RepR for all lanes. | ||
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | ||
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)); | ||
} | ||
RepR->eraseFromParent(); | ||
continue; | ||
} | ||
/// Create single-scalar version of RepR for all lanes. | ||
SmallVector<VPValue *> LaneDefs; | ||
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | ||
LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); | ||
|
||
/// Users that only demand the first lane can use the definition for lane | ||
/// 0. | ||
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { | ||
return U.onlyFirstLaneUsed(RepR); | ||
}); | ||
|
||
// If needed, create a Build(Struct)Vector recipe to insert the scalar | ||
// lane values into a vector. | ||
Comment on lines
+526
to
+527
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So a pair of replicating recipes one feeding the other is replaced by VF recipes feeding a buildVector which VF other recipes extract from, where the extracts are optimized away by cloneForLane(); and the buildVector possibly by dce? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep |
||
Type *ResTy = RepR->getUnderlyingInstr()->getType(); | ||
VPValue *VecRes = Builder.createNaryOp( | ||
ResTy->isStructTy() ? VPInstruction::BuildStructVector | ||
: VPInstruction::BuildVector, | ||
LaneDefs); | ||
RepR->replaceAllUsesWith(VecRes); | ||
RepR->eraseFromParent(); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is done now rather than later to reduce test diff?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep without the fold we would have additional insert/extracts for uses inside replicate regions, with corresponding test changes.