Skip to content

[VPlan] Model address separately. #72164

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 13 additions & 46 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8174,13 +8174,20 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
Reverse, I->getDebugLoc());
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
Consecutive, Reverse);
return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
Reverse);

StoreInst *Store = cast<StoreInst>(I);
return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
Mask, Consecutive, Reverse);
return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
Consecutive, Reverse);
}

/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
Expand Down Expand Up @@ -9485,44 +9492,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: "only neede[d] for" typo above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 516cc98

}

const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr = nullptr;

// Use i32 for the gep index type when the value is constant,
// or query DataLayout for a more suitable index type otherwise.
const DataLayout &DL =
Builder.GetInsertBlock()->getModule()->getDataLayout();
Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
? DL.getIndexType(PointerType::getUnqual(
ScalarDataTy->getContext()))
: Builder.getInt32Ty();
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
InBounds = gep->isInBounds();
if (isReverse()) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
// RunTimeVF = VScale * VF.getKnownMinValue()
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
// NumElt = -Part * RunTimeVF
Value *NumElt =
Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
// LastLane = 1 - RunTimeVF
Value *LastLane =
Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
PartPtr =
Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
} else {
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
}

return PartPtr;
};

// Handle Stores:
if (SI) {
State.setDebugLocFrom(SI->getDebugLoc());
Expand All @@ -9543,8 +9512,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
auto *VecPtr =
CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
auto *VecPtr = State.get(getAddr(), Part);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
BlockInMaskParts[Part]);
Expand All @@ -9568,8 +9536,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
nullptr, "wide.masked.gather");
State.addMetadata(NewLI, LI);
} else {
auto *VecPtr =
CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
auto *VecPtr = State.get(getAddr(), Part);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,36 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue {
#endif
};

/// A recipe to compute the pointers for widened memory accesses of IndexTy for
/// all parts. If IsReverse is true, compute pointers for accessing the input in
/// reverse order per part.
class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
Type *IndexedTy;
bool IsReverse;

public:
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
DebugLoc DL)
: VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this),
IndexedTy(IndexedTy), IsReverse(IsReverse) {}

VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)

void execute(VPTransformState &State) override;

bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
};

/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
Expand Down
53 changes: 53 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,59 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif

void VPVectorPointerRecipe ::execute(VPTransformState &State) {
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr = nullptr;
// Use i32 for the gep index type when the value is constant,
// or query DataLayout for a more suitable index type otherwise.
const DataLayout &DL =
Builder.GetInsertBlock()->getModule()->getDataLayout();
Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
? DL.getIndexType(IndexedTy->getPointerTo())
: Builder.getInt32Ty();
Comment on lines +1218 to +1224
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The recipe holds IndexTy to avoid the need to look it up.
Note that DL is overloaded - as a member of recipe it stands for Debug Location.

Suggested change
// Use i32 for the gep index type when the value is constant,
// or query DataLayout for a more suitable index type otherwise.
const DataLayout &DL =
Builder.GetInsertBlock()->getModule()->getDataLayout();
Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
? DL.getIndexType(IndexedTy->getPointerTo())
: Builder.getInt32Ty();

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The recipe stores the indexed type, which is different to the index type (integer type used for the indices of the generated GEP). Left as is for now.

Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
bool InBounds = false;
if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit (unrelated to this patch): should this if be an assert, must consecutive Ptr's be GEP's, do we risk losing inbound information by retrieving it at code-gen rather than recording it when introducing the recipe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, the inbounds info should be recorded in the recipe, which I'll do as follow-up.

InBounds = GEP->isInBounds();
if (IsReverse) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
// RunTimeVF = VScale * VF.getKnownMinValue()
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
// NumElt = -Part * RunTimeVF
Value *NumElt = Builder.CreateMul(
ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
// LastLane = 1 - RunTimeVF
Value *LastLane =
Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
} else {
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
}

State.set(this, PartPtr, Part);
}
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
O << " = vector-pointer ";
if (IsReverse)
O << "(reverse) ";

printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
// We know that all PHIs in non-header blocks are converted into
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanValue.h
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ class VPDef {
VPReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
VPVectorPointerSC,
VPWidenCallSC,
VPWidenCanonicalIVSC,
VPWidenCastSC,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i16> [[TMP8]] to <16 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP11]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16
; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
Expand Down Expand Up @@ -459,8 +459,8 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
Expand Down
Loading