Skip to content

Commit 0b07987

Browse files
committed
[VPlan] Simplify VPBlendRecipes to select instructions
When looking at some EVL tail-folded code in SPEC CPU 2017, I noticed we sometimes have both VPBlendRecipes and select VPInstructions in the same plan:

    EMIT vp<%active.lane.mask> = active lane mask vp<%5>, vp<%3>
    EMIT vp<%7> = icmp ...
    EMIT vp<%8> = logical-and vp<%active.lane.mask>, vp<%7>
    BLEND ir<%8> = ir<%n.015> ir<%foo>/vp<%8>
    EMIT vp<%9> = select vp<%active.lane.mask>, ir<%8>, ir<%n.015>

Since a blend will ultimately generate a chain of selects, we could fold the blend into the select:

    EMIT vp<%active.lane.mask> = active lane mask vp<%5>, vp<%3>
    EMIT vp<%7> = icmp ...
    EMIT vp<%8> = logical-and vp<%active.lane.mask>, vp<%7>
    EMIT ir<%8> = select vp<%8>, ir<%foo>, ir<%n.015>

So this patch canonicalizes blends to a series of select instructions, which allows them to be simplified further with other select instructions.

The `BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask` optimisation has also been converted to operate on selects.
1 parent 1fe993c commit 0b07987

14 files changed

+73
-176
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2297,10 +2297,6 @@ class VPBlendRecipe : public VPSingleDefRecipe {
22972297
/// Generate the phi/select nodes.
22982298
void execute(VPTransformState &State) override;
22992299

2300-
/// Return the cost of this VPWidenMemoryRecipe.
2301-
InstructionCost computeCost(ElementCount VF,
2302-
VPCostContext &Ctx) const override;
2303-
23042300
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
23052301
/// Print the recipe.
23062302
void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 14 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -909,6 +909,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
909909
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
910910
Ctx.CostKind);
911911
}
912+
case Instruction::Select: {
913+
if (!getUnderlyingValue())
914+
return 0;
915+
// Handle cases where only the first lane is used the same way as the legacy
916+
// cost model.
917+
if (vputils::onlyFirstLaneUsed(this))
918+
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
919+
Type *ResTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
920+
Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
921+
return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResTy, CmpTy,
922+
CmpInst::BAD_ICMP_PREDICATE,
923+
Ctx.CostKind);
924+
}
912925
case VPInstruction::AnyOf: {
913926
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
914927
return Ctx.TTI.getArithmeticReductionCost(
@@ -2380,53 +2393,7 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
23802393
#endif
23812394

23822395
void VPBlendRecipe::execute(VPTransformState &State) {
2383-
assert(isNormalized() && "Expected blend to be normalized!");
2384-
// We know that all PHIs in non-header blocks are converted into
2385-
// selects, so we don't have to worry about the insertion order and we
2386-
// can just use the builder.
2387-
// At this point we generate the predication tree. There may be
2388-
// duplications since this is a simple recursive scan, but future
2389-
// optimizations will clean it up.
2390-
2391-
unsigned NumIncoming = getNumIncomingValues();
2392-
2393-
// Generate a sequence of selects of the form:
2394-
// SELECT(Mask3, In3,
2395-
// SELECT(Mask2, In2,
2396-
// SELECT(Mask1, In1,
2397-
// In0)))
2398-
// Note that Mask0 is never used: lanes for which no path reaches this phi and
2399-
// are essentially undef are taken from In0.
2400-
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2401-
Value *Result = nullptr;
2402-
for (unsigned In = 0; In < NumIncoming; ++In) {
2403-
// We might have single edge PHIs (blocks) - use an identity
2404-
// 'select' for the first PHI operand.
2405-
Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2406-
if (In == 0)
2407-
Result = In0; // Initialize with the first incoming value.
2408-
else {
2409-
// Select between the current value and the previous incoming edge
2410-
// based on the incoming mask.
2411-
Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2412-
Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2413-
}
2414-
}
2415-
State.set(this, Result, OnlyFirstLaneUsed);
2416-
}
2417-
2418-
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2419-
VPCostContext &Ctx) const {
2420-
// Handle cases where only the first lane is used the same way as the legacy
2421-
// cost model.
2422-
if (vputils::onlyFirstLaneUsed(this))
2423-
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2424-
2425-
Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2426-
Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2427-
return (getNumIncomingValues() - 1) *
2428-
Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2429-
CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2396+
llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
24302397
}
24312398

24322399
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,6 +1082,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
10821082
if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
10831083
return Def->replaceAllUsesWith(X);
10841084

1085+
// select !c, x, y -> select c, y, x
1086+
VPValue *C;
1087+
if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1088+
Def->setOperand(0, C);
1089+
Def->setOperand(1, Y);
1090+
Def->setOperand(2, X);
1091+
return;
1092+
}
1093+
10851094
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
10861095
return Def->replaceAllUsesWith(A);
10871096

@@ -1288,38 +1297,17 @@ static void simplifyBlends(VPlan &Plan) {
12881297
}
12891298
}
12901299

1291-
SmallVector<VPValue *, 4> OperandsWithMask;
1292-
OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1293-
1300+
VPBuilder Builder(&R);
1301+
VPValue *Select = Blend->getIncomingValue(StartIndex);
12941302
for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
12951303
if (I == StartIndex)
12961304
continue;
1297-
OperandsWithMask.push_back(Blend->getIncomingValue(I));
1298-
OperandsWithMask.push_back(Blend->getMask(I));
1299-
}
1300-
1301-
auto *NewBlend = new VPBlendRecipe(
1302-
cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
1303-
NewBlend->insertBefore(&R);
1304-
1305-
VPValue *DeadMask = Blend->getMask(StartIndex);
1306-
Blend->replaceAllUsesWith(NewBlend);
1307-
Blend->eraseFromParent();
1308-
recursivelyDeleteDeadRecipes(DeadMask);
1309-
1310-
/// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1311-
VPValue *NewMask;
1312-
if (NewBlend->getNumOperands() == 3 &&
1313-
match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1314-
VPValue *Inc0 = NewBlend->getOperand(0);
1315-
VPValue *Inc1 = NewBlend->getOperand(1);
1316-
VPValue *OldMask = NewBlend->getOperand(2);
1317-
NewBlend->setOperand(0, Inc1);
1318-
NewBlend->setOperand(1, Inc0);
1319-
NewBlend->setOperand(2, NewMask);
1320-
if (OldMask->getNumUsers() == 0)
1321-
cast<VPInstruction>(OldMask)->eraseFromParent();
1305+
Select =
1306+
Builder.createSelect(Blend->getMask(I), Blend->getIncomingValue(I),
1307+
Select, R.getDebugLoc(), "predphi");
1308+
Select->setUnderlyingValue(Blend->getUnderlyingValue());
13221309
}
1310+
Blend->replaceAllUsesWith(Select);
13231311
}
13241312
}
13251313
}

llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
2222
; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
2323
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
2424
; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
25-
; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
26-
; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
25+
; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ule <2 x double> [[TMP2]], zeroinitializer
26+
; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
2727
; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
2828
; TFNONE-NEXT: store double [[TMP14]], ptr [[P:%.*]], align 8
2929
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -935,8 +935,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
935935
; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
936936
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
937937
; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
938-
; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
939-
; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
938+
; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ule <vscale x 2 x double> [[TMP8]], zeroinitializer
939+
; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> zeroinitializer
940940
; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
941941
; TFNONE-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
942942
; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1

llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,8 +597,8 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
597597
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
598598
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
599599
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
600-
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
601-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> splat (i16 99), <vscale x 8 x i16> [[WIDE_LOAD]]
600+
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
601+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i16> [[WIDE_LOAD]], <vscale x 8 x i16> splat (i16 99)
602602
; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP11]], align 2
603603
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]]
604604
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -677,9 +677,9 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
677677
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
678678
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
679679
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
680-
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
680+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
681681
; CHECK-NEXT: [[TMP10:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 27)
682-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
682+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i64> [[TMP10]]
683683
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
684684
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
685685
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -720,12 +720,12 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
720720
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
721721
; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
722722
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
723-
; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42)
724-
; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
723+
; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
724+
; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
725725
; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], splat (i64 27)
726726
; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27)
727-
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]]
728-
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]]
727+
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]]
728+
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]]
729729
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
730730
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8
731731
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -797,9 +797,9 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
797797
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
798798
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
799799
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
800-
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
800+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
801801
; CHECK-NEXT: [[TMP10:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 27)
802-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
802+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i64> [[TMP10]]
803803
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
804804
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
805805
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -840,12 +840,12 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
840840
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
841841
; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
842842
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
843-
; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42)
844-
; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
843+
; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
844+
; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
845845
; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], splat (i64 27)
846846
; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27)
847-
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]]
848-
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]]
847+
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]]
848+
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]]
849849
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
850850
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8
851851
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -398,9 +398,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
398398
; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
399399
; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
400400
; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
401-
; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
401+
; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp sle <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
402402
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
403-
; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]]
403+
; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP16]]
404404
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
405405
; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
406406
; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -956,9 +956,9 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
956956
; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
957957
; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
958958
; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP27]], align 4
959-
; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
959+
; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp sle <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
960960
; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
961-
; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP28]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[VEC_PHI]]
961+
; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP22]]
962962
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
963963
; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
964964
; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ define i32 @test_explicit_pred(i64 %len) {
3434
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
3535
; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
3636
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], splat (i64 4)
37-
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
38-
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
39-
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
40-
; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
37+
; CHECK-NEXT: [[TMP0:%.*]] = icmp sge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
38+
; CHECK-NEXT: [[TMP1:%.*]] = icmp sge <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
39+
; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
40+
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
4141
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[INDEX]]
4242
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
4343
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
@@ -47,10 +47,10 @@ define i32 @test_explicit_pred(i64 %len) {
4747
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
4848
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
4949
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
50-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
51-
; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
52-
; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
53-
; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
50+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD]]
51+
; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD7]]
52+
; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD8]]
53+
; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD9]]
5454
; CHECK-NEXT: [[TMP16]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
5555
; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
5656
; CHECK-NEXT: [[TMP18]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]

0 commit comments

Comments
 (0)