From 512aa0d5e7bfd0ca2381a0675fa29384fa650faf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Oct 2024 19:40:51 +0100 Subject: [PATCH 1/7] [LV] Use SCEV to check if minimum iteration check is known. Use SCEV to check if the minimum iteration check (TC < Step) is known to be false. This is a first step towards addressing https://github.com/llvm/llvm-project/issues/111098. To catch the exact case from the issue, we need to do extra work to make sure the wrap flags on the shl are preserved and used by SCEV. --- .../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++--- .../AArch64/eliminate-tail-predication.ll | 3 +- .../LoopVectorize/AArch64/masked-call.ll | 44 ++++--------------- .../AArch64/pr60831-sve-inv-store-crash.ll | 3 +- .../LoopVectorize/AArch64/sve-tail-folding.ll | 3 +- .../AArch64/wider-VF-for-callinst.ll | 5 +-- .../Transforms/LoopVectorize/if-reduction.ll | 3 +- .../version-stride-with-integer-casts.ll | 3 +- 8 files changed, 30 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 35c042b3ab7fc..c349fa65343c4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2438,12 +2438,21 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { }; TailFoldingStyle Style = Cost->getTailFoldingStyle(); - if (Style == TailFoldingStyle::None) - CheckMinIters = - Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); - else if (VF.isScalable() && - !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && - Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + if (Style == TailFoldingStyle::None) { + Value *Step = CreateStep(); + ScalarEvolution &SE = *PSE.getSE(); + // Check if we can prove that the trip count is >= the step. 
+ const SCEV *TripCountSCEV = SE.getTripCountFromExitCount( + PSE.getBackedgeTakenCount(), CountTy, OrigLoop); + if (SE.isKnownPredicate(CmpInst::getInversePredicate(P), + SE.applyLoopGuards(TripCountSCEV, OrigLoop), + SE.getSCEV(Step))) + CheckMinIters = Builder.getFalse(); + else + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } else if (VF.isScalable() && + !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && + Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an // additional overflow check is required before entering the vector loop. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll index 8c50d86489c9d..7dcab6d807cf7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll @@ -11,8 +11,7 @@ define void @f1(ptr %A) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 93034f4dbe56e..5496eed16e544 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -11,10 +11,7 @@ target triple = "aarch64-unknown-linux-gnu" define void 
@test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -146,10 +143,7 @@ for.cond.cleanup: define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_if_then( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -310,10 +304,7 @@ for.cond.cleanup: define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen_if_then_else( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -490,10 +481,7 @@ for.cond.cleanup: define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: 
@test_widen_nomask( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -548,11 +536,6 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; ; TFFALLBACK-LABEL: @test_widen_nomask( ; TFFALLBACK-NEXT: entry: -; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFFALLBACK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFFALLBACK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; TFFALLBACK: vector.ph: ; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] @@ -561,7 +544,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]] ; TFFALLBACK: vector.body: -; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] ; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; TFFALLBACK-NEXT: [[TMP7:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) @@ -569,12 +552,9 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 ; TFFALLBACK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; TFFALLBACK: scalar.ph: -; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]] +; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TFFALLBACK: for.body: -; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] @@ -626,10 +606,7 @@ for.cond.cleanup: define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen_optmask( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -791,10 +768,7 @@ for.cond.cleanup: define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, double %m) #4 { ; TFNONE-LABEL: @test_widen_fmuladd_and_call( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll index 0e95d742092e6..d18cdc1ae617a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll @@ -10,8 +10,7 @@ define void @test_invar_gep(ptr %dst) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 94b90aa3cfb30..1d150141e6251 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -757,8 +757,7 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index 4a2f9d07ed91c..4a3bc4679bba4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -7,10 +7,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_widen(ptr noalias %a, ptr readnone %b) #1 { ; WIDE-LABEL: @test_widen( ; WIDE-NEXT: entry: -; WIDE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; WIDE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; WIDE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; WIDE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; WIDE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; WIDE: vector.ph: ; WIDE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; WIDE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index 383b62b368ef0..5f6824a022d56 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -1668,8 +1668,7 @@ define i32 @fcmp_0_sub_select1(ptr noalias %x, i32 %N) nounwind readonly { ; CHECK: [[FOR_HEADER]]: ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[ZEXT]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], 
[[N_MOD_VF]] diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index b3ec3e8f0f3c6..a85242874410a 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -423,8 +423,7 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[G_64:%.*]] = zext i1 [[G]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 15, [[G_64]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] From 86ea8c39a9424ee5c481502fae3ae6667136f398 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 9 Oct 2024 14:34:09 +0100 Subject: [PATCH 2/7] !fixup retrieve trip count SCEV from Count value directly. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e21cb447c3285..9c4c794736408 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2444,8 +2444,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Step = CreateStep(); ScalarEvolution &SE = *PSE.getSE(); // Check if we can prove that the trip count is >= the step. 
- const SCEV *TripCountSCEV = SE.getTripCountFromExitCount( - PSE.getBackedgeTakenCount(), CountTy, OrigLoop); + const SCEV *TripCountSCEV = SE.getSCEV(Count); if (SE.isKnownPredicate(CmpInst::getInversePredicate(P), SE.applyLoopGuards(TripCountSCEV, OrigLoop), SE.getSCEV(Step))) From 8a89ff3a476f819f8cb2d3bf760a9a4f45df97b7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 9 Oct 2024 20:36:38 +0100 Subject: [PATCH 3/7] !fixup address latest comments, thanks --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c4c794736408..48d2d53dbd664 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2444,6 +2444,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Step = CreateStep(); ScalarEvolution &SE = *PSE.getSE(); // Check if we can prove that the trip count is >= the step. + // TODO: Emit unconditional branch to vector preheader instead of + // conditional branch with known condition. const SCEV *TripCountSCEV = SE.getSCEV(Count); if (SE.isKnownPredicate(CmpInst::getInversePredicate(P), SE.applyLoopGuards(TripCountSCEV, OrigLoop), From 5339f4db041ca77fda3d8271602668bbc7ef7c6d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 10 Oct 2024 12:53:51 +0100 Subject: [PATCH 4/7] !fixup address latest comments, thanks! 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 69dbf3db4e82b..18c690fec892f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2443,16 +2443,22 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { if (Style == TailFoldingStyle::None) { Value *Step = CreateStep(); ScalarEvolution &SE = *PSE.getSE(); - // Check if we can prove that the trip count is >= the step. // TODO: Emit unconditional branch to vector preheader instead of // conditional branch with known condition. const SCEV *TripCountSCEV = SE.getSCEV(Count); - if (SE.isKnownPredicate(CmpInst::getInversePredicate(P), - SE.applyLoopGuards(TripCountSCEV, OrigLoop), - SE.getSCEV(Step))) - CheckMinIters = Builder.getFalse(); - else + // Check if the trip count is < the step. + if (SE.isKnownPredicate(P, SE.applyLoopGuards(TripCountSCEV, OrigLoop), + SE.getSCEV(Step))) { + // TODO: Should not attempt to vectorize when the vector loop is known to + // never execute. + CheckMinIters = Builder.getTrue(); + } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), + SE.applyLoopGuards(TripCountSCEV, OrigLoop), + SE.getSCEV(Step))) { + // Only generate the minimum iteration check only if we cannot prove the + // check is known to be false. 
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } } else if (VF.isScalable() && !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { @@ -2465,8 +2471,17 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { ConstantInt::get(CountTy, cast(CountTy)->getMask()); Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); + Value *Step = CreateStep(); + ScalarEvolution &SE = *PSE.getSE(); + // Check if we can prove that the trip count is >= the step. + // TODO: Emit unconditional branch to vector preheader instead of + // conditional branch with known condition. + const SCEV *TripCountSCEV = SE.getSCEV(LHS); + assert(!SE.isKnownPredicate(CmpInst::getInversePredicate(ICmpInst::ICMP_ULT), + SE.applyLoopGuards(TripCountSCEV, OrigLoop), + SE.getSCEV(Step)) && "SCEV unexpectedly proved overflow check to be known); // Don't execute the vector loop if (UMax - n) < (VF * UF). - CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } // Create new preheader for vector loop. From cacf91ba0e75aefed7b1297974ba7dc4224eac6b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 13 Oct 2024 21:28:21 +0100 Subject: [PATCH 5/7] !fixup address comments, thanks! 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 30 +++++++++---------- .../version-stride-with-integer-casts.ll | 3 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6b561e4fdc890..546a5d54814ad 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2444,19 +2444,20 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { ScalarEvolution &SE = *PSE.getSE(); // TODO: Emit unconditional branch to vector preheader instead of // conditional branch with known condition. - const SCEV *TripCountSCEV = SE.getSCEV(Count); + const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop); // Check if the trip count is < the step. - if (SE.isKnownPredicate(P, SE.applyLoopGuards(TripCountSCEV, OrigLoop), - SE.getSCEV(Step))) { - // TODO: Should not attempt to vectorize when the vector loop is known to - // never execute. + if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) { + // TODO: Ensure step is at most the trip count when determining max VF and + // UF, w/o tail folding. CheckMinIters = Builder.getTrue(); } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), - SE.applyLoopGuards(TripCountSCEV, OrigLoop), - SE.getSCEV(Step))) { - // Only generate the minimum iteration check only if we cannot prove the - // check is known to be false. + TripCountSCEV, SE.getSCEV(Step))) { + // Generate the minimum iteration check only if we cannot prove the + // check is known to be true, or known to be false CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + + // else step is known to be smaller than trip count, use CheckMinIters + // preset to false. 
} } else if (VF.isScalable() && !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && @@ -2473,12 +2474,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Step = CreateStep(); ScalarEvolution &SE = *PSE.getSE(); // Check if we can prove that the trip count is >= the step. - // TODO: Emit unconditional branch to vector preheader instead of - // conditional branch with known condition. - const SCEV *TripCountSCEV = SE.getSCEV(LHS); - assert(!SE.isKnownPredicate(CmpInst::getInversePredicate(ICmpInst::ICMP_ULT), - SE.applyLoopGuards(TripCountSCEV, OrigLoop), - SE.getSCEV(Step)) && "SCEV unexpectedly proved overflow check to be known); + const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(LHS), OrigLoop); + assert( + !SE.isKnownPredicate(CmpInst::getInversePredicate(ICmpInst::ICMP_ULT), + TripCountSCEV, SE.getSCEV(Step)) && + "SCEV unexpectedly proved overflow check to be known"); // Don't execute the vector loop if (UMax - n) < (VF * UF). CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index a85242874410a..60e1d340ee1c9 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -488,8 +488,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], [[G_64]] ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true ; 
CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] From 5e455de898272f9bd0c581cfb2a4b8d447304ae0 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Oct 2024 08:20:12 +0100 Subject: [PATCH 6/7] !fixup wrap code only used in assertion in NDEBUG --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0c8a7ba9da66e..b4d1f07ed73a4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2471,13 +2471,14 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); Value *Step = CreateStep(); +#ifndef NDEBUG ScalarEvolution &SE = *PSE.getSE(); - // Check if we can prove that the trip count is >= the step. const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(LHS), OrigLoop); assert( !SE.isKnownPredicate(CmpInst::getInversePredicate(ICmpInst::ICMP_ULT), TripCountSCEV, SE.getSCEV(Step)) && "SCEV unexpectedly proved overflow check to be known"); +#endif // Don't execute the vector loop if (UMax - n) < (VF * UF). CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } From 660fe447234514b7e69fd61c48b739b643f6cabb Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Oct 2024 13:28:36 -0700 Subject: [PATCH 7/7] !fixup address latest comments, thanks! 
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 ++++++-------- llvm/test/Transforms/LoopVectorize/if-reduction.ll | 1 + .../version-stride-with-integer-casts.ll | 1 + 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 18042a9b9e8a9..86b95cf41d18b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2449,12 +2449,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), TripCountSCEV, SE.getSCEV(Step))) { // Generate the minimum iteration check only if we cannot prove the - // check is known to be true, or known to be false + // check is known to be true, or known to be false. CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); - - // else step is known to be smaller than trip count, use CheckMinIters - // preset to false. - } + } // else step known to be < trip count, use CheckMinIters preset to false. } else if (VF.isScalable() && !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { @@ -2470,11 +2467,12 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Step = CreateStep(); #ifndef NDEBUG ScalarEvolution &SE = *PSE.getSE(); - const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(LHS), OrigLoop); + const SCEV *TC2OverflowSCEV = SE.applyLoopGuards(SE.getSCEV(LHS), OrigLoop); assert( + !isIndvarOverflowCheckKnownFalse(Cost, VF * UF) && !SE.isKnownPredicate(CmpInst::getInversePredicate(ICmpInst::ICMP_ULT), - TripCountSCEV, SE.getSCEV(Step)) && - "SCEV unexpectedly proved overflow check to be known"); + TC2OverflowSCEV, SE.getSCEV(Step)) && + "unexpectedly proved overflow check to be known"); #endif // Don't execute the vector loop if (UMax - n) < (VF * UF). 
CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index 5f6824a022d56..330cdeaeb7c27 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -1659,6 +1659,7 @@ for.end: ; preds = %for.body, %entry ret i64 %1 } +; FIXME: %indvars.iv.next is poison on first iteration due to sub nuw 0, 1. define i32 @fcmp_0_sub_select1(ptr noalias %x, i32 %N) nounwind readonly { ; CHECK-LABEL: define i32 @fcmp_0_sub_select1( ; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 60e1d340ee1c9..5e65832aba8cc 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -415,6 +415,7 @@ exit: ; Test case to make sure that uses of versioned strides of type i1 are properly ; extended. From https://github.com/llvm/llvm-project/issues/91369. +; TODO: Better check (udiv i64 15, %g.64) after checking if %g == 1. define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-LABEL: define void @zext_of_i1_stride( ; CHECK-SAME: i1 [[G:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {