Commit 16968a0

[spr] initial version

Created using spr 1.3.4

2 parents: 29fd3e2 + d6bceb5

59 files changed: 699 additions, 629 deletions

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 22 additions & 50 deletions
@@ -8231,13 +8231,24 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

+  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+  if (Decision != LoopVectorizationCostModel::CM_GatherScatter &&
+      Decision != LoopVectorizationCostModel::CM_Interleave) {
+    auto *VectorPtr = Reverse
+                          ? new VPInstruction(VPInstruction::CreateVectorPtr,
+                                              {Ptr, Ptr}, I->getDebugLoc())
+                          : new VPInstruction(VPInstruction::CreateVectorPtr,
+                                              {Ptr}, I->getDebugLoc());
+    Builder.getInsertBlock()->appendRecipe(VectorPtr);
+    Ptr = VectorPtr;
+  }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
-                                              Consecutive, Reverse);
+    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+                                              Reverse);

   StoreInst *Store = cast<StoreInst>(I);
-  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
-                                            Mask, Consecutive, Reverse);
+  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+                                            Consecutive, Reverse);
 }

 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -9525,50 +9536,13 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
   bool isMaskRequired = getMask();
   if (isMaskRequired)
-    for (unsigned Part = 0; Part < State.UF; ++Part)
-      BlockInMaskParts[Part] = State.get(getMask(), Part);
-
-  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
-    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = nullptr;
-
-    // Use i32 for the gep index type when the value is constant,
-    // or query DataLayout for a more suitable index type otherwise.
-    const DataLayout &DL =
-        Builder.GetInsertBlock()->getModule()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
-                        ? DL.getIndexType(PointerType::getUnqual(
-                              ScalarDataTy->getContext()))
-                        : Builder.getInt32Ty();
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    if (isReverse()) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -Part * RunTimeVF
-      Value *NumElt =
-          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane =
-          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
-      PartPtr =
-          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
-      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
-        BlockInMaskParts[Part] =
-            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
-    } else {
-      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value *Mask = State.get(getMask(), Part);
+      if (isReverse())
+        Mask = Builder.CreateVectorReverse(Mask, "reverse");
+      BlockInMaskParts[Part] = Mask;
     }

-    return PartPtr;
-  };
-
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());
@@ -9589,8 +9563,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
       // We don't want to update the value in the map as it might be used in
       // another expression. So don't call resetVectorValue(StoredVal).
     }
-    auto *VecPtr =
-        CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+    auto *VecPtr = State.get(getAddr(), Part);
     if (isMaskRequired)
       NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                         BlockInMaskParts[Part]);
@@ -9614,8 +9587,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
                                      nullptr, "wide.masked.gather");
       State.addMetadata(NewLI, LI);
     } else {
-      auto *VecPtr =
-          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+      auto *VecPtr = State.get(getAddr(), Part);
       if (isMaskRequired)
         NewLI = Builder.CreateMaskedLoad(
             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
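
Note: at the IR level, the consecutive (non-reverse) addressing is unchanged by this commit; only the point where the per-part GEP is emitted moves from VPWidenMemoryInstructionRecipe::execute into the new CreateVectorPtr recipe. A minimal sketch of the emitted addressing for a consecutive i32 load with fixed-width VF=4 and UF=2 (value names are illustrative, not taken from this commit):

  %gep = getelementptr inbounds i32, ptr %src, i64 %index
  ; part 0: createStepForVF yields i32 0, so the GEP folds to %gep
  %wide.load = load <4 x i32>, ptr %gep, align 4
  ; part 1: createStepForVF yields i32 4 (Part * VF)
  %vec.ptr.1 = getelementptr inbounds i32, ptr %gep, i32 4
  %wide.load.1 = load <4 x i32>, ptr %vec.ptr.1, align 4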

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 1 deletion
@@ -1038,7 +1038,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     // canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    CreateVectorPtr
   };

 private:
@@ -1146,6 +1147,7 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementForPart:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::CreateVectorPtr:
       return true;
     };
     llvm_unreachable("switch should return");

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 47 additions & 0 deletions
@@ -122,6 +122,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrement:
   case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::CreateVectorPtr:
     return false;
   default:
     return true;
@@ -404,6 +405,49 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::CreateVectorPtr: {
+    // Calculate the pointer for the specific unroll-part.
+    Value *PartPtr = nullptr;
+    bool IsReverse = getNumOperands() > 1;
+    auto *MemR = cast<VPWidenMemoryInstructionRecipe>(*user_begin());
+    Type *ScalarDataTy =
+        MemR->isStore() ? cast<StoreInst>(&MemR->getIngredient())
+                              ->getValueOperand()
+                              ->getType()
+                        : cast<LoadInst>(&MemR->getIngredient())->getType();
+    // Use i32 for the gep index type when the value is constant,
+    // or query DataLayout for a more suitable index type otherwise.
+    const DataLayout &DL =
+        Builder.GetInsertBlock()->getModule()->getDataLayout();
+    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+                        ? DL.getIndexType(ScalarDataTy->getPointerTo())
+                        : Builder.getInt32Ty();
+    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+    if (IsReverse) {
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+      // NumElt = -Part * RunTimeVF
+      Value *NumElt =
+          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+      // LastLane = 1 - RunTimeVF
+      Value *LastLane =
+          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
+      PartPtr =
+          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
+    } else {
+      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
+    }
+
+    return PartPtr;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
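
Note: a worked example of the reverse-case arithmetic above, assuming fixed-width VF=4 and unroll part 1, so RunTimeVF = 4, NumElt = -1 * 4 = -4, and LastLane = 1 - 4 = -3. The emitted IR is roughly (illustrative names; inbounds only if inherited from the original GEP):

  %part.ptr = getelementptr inbounds i32, ptr %ptr, i32 -4
  %part.ptr.last = getelementptr inbounds i32, ptr %part.ptr, i32 -3
  %wide.load = load <4 x i32>, ptr %part.ptr.last, align 4

The wide load starts 7 elements below the lane-0 address and reads lanes 7..4 in ascending address order; the vector reverse emitted by VPWidenMemoryInstructionRecipe::execute (not part of this hunk) restores lane order.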
@@ -483,6 +527,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
+  case VPInstruction::CreateVectorPtr:
+    O << "create-vector-pointer";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
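
Note: with the print hook above, the recipe shows up in VPlan debug dumps roughly as follows (a sketch assuming the usual EMIT form for VPInstructions; the value number is illustrative, and the reverse variant prints a second, duplicated pointer operand):

  EMIT vp<%3> = create-vector-pointer ir<%gep>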

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll

Lines changed: 9 additions & 9 deletions
@@ -179,8 +179,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]]
-; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16
+; CHECK-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
@@ -193,18 +193,18 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX3:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[A]] to i16
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], <i16 8, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
 ; CHECK-NEXT:    [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[INDEX3]] to i64
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]]
 ; CHECK-NEXT:    store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i32 [[INDEX3]], 8
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 1000
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -268,19 +268,19 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX2:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[A]] to i16
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i16> undef, i16 [[TMP10]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP11]], <i16 99, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr <8 x i16> [[TMP13]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 ; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[C]], <8 x i16> [[TMP14]], <8 x i16> [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc <8 x i16> [[TMP15]] to <8 x i8>
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[INDEX2]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[INDEX1]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP17]]
 ; CHECK-NEXT:    store <8 x i8> [[TMP16]], ptr [[TMP18]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 1000
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 8
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 1000
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
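
Note: the test updates are mechanical fallout. Since the address is now materialized by its own recipe before the memory recipe executes, the part-1 GEP is emitted ahead of both stores rather than between them, and the regenerated CHECK lines renumber FileCheck variables accordingly (INDEX4 to INDEX3, INDEX_NEXT9 to INDEX_NEXT8, and so on). Schematically, for the first hunk (names match the CHECK variables above):

  ; before: store via TMP10, compute TMP11, store via TMP11
  ; after:  compute TMP11 first, then both stores
  %tmp11 = getelementptr inbounds i8, ptr %tmp10, i64 16
  store <16 x i8> %tmp7, ptr %tmp10, align 1
  store <16 x i8> %tmp8, ptr %tmp11, align 1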
