From 18ea5709eceeff6062b5223abd831e78fcb78dc4 Mon Sep 17 00:00:00 2001 From: ShihPo Hung Date: Mon, 31 Mar 2025 23:54:09 -0700 Subject: [PATCH 1/4] [LV][NFC] Clean up tail-folding check for early-exit loops This patch moves the check for a single latch exit from computeMaxVF() to LoopVectorizationLegality::canFoldTailByMasking(), as it duplicates the logic when foldTailByMasking() returns false. It also introduces HasSingleLatchExit to prevent early-exit loops from entering code paths that assume non-predicated loops. --- .../Vectorize/LoopVectorizationLegality.cpp | 8 +++++++ .../Transforms/Vectorize/LoopVectorize.cpp | 21 +++---------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 3ec6850d6f685..0dc065333f807 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1924,6 +1924,14 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { } } + // The only loops we can vectorize without a scalar epilogue, are loops with + // a bottom-test and a single exiting block. We'd have to handle the fact + // that not every instruction executes on the last iteration. This will + // require a lane mask which varies through the vector loop body. (TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n"); + return false; + } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); return true; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 55cc801e91452..477514d907201 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3987,22 +3987,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { break; } - // The only loops we can vectorize without a scalar epilogue, are loops with - // a bottom-test and a single exiting block. We'd have to handle the fact - // that not every instruction executes on the last iteration. This will - // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(MaxTC, UserVF, false); - } - return FixedScalableVFPair::getNone(); - } - // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask @@ -4049,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return Rem->isZero(); }; - if (MaxPowerOf2RuntimeVF > 0u) { + bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch(); + if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) { assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) { @@ -4060,7 +4045,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop); - if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) { + if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) { if (MaxPowerOf2RuntimeVF > 0u) { // If we have a low-trip-count, and the fixed-width VF is known to divide // the trip count but the scalable factor does not, use the fixed-width From 45541a4937ae935bdcd2a523c482e5eb8d60b82b Mon Sep 17 00:00:00 2001 From: ShihPo Hung Date: Tue, 1 Apr 2025 18:38:16 -0700 Subject: [PATCH 2/4] Style update --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 4 +++- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 0dc065333f807..0763a255b3afa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1929,7 +1929,9 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. (TODO) if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n"); + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking. Requires a singe latch exit\n"); return false; } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 477514d907201..a010f5c52e9a7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4033,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return Rem->isZero(); }; - bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch(); + bool HasSingleLatchExit = + TheLoop->getExitingBlock() == TheLoop->getLoopLatch(); if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) { assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); @@ -4045,7 +4046,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop); - if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) { + if (HasSingleLatchExit && ExpectedTC && + ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) { if (MaxPowerOf2RuntimeVF > 0u) { // If we have a low-trip-count, and the fixed-width VF is known to divide // the trip count but the scalable factor does not, use the fixed-width From e9df81f6a588211931d55f600372910d79d08e55 Mon Sep 17 00:00:00 2001 From: ShihPo Hung Date: Thu, 3 Apr 2025 07:15:20 -0700 Subject: [PATCH 3/4] Moves check to NoScalarEpilogueNeeded --- .../Vectorize/LoopVectorizationLegality.cpp | 20 +++++++++---------- .../Transforms/Vectorize/LoopVectorize.cpp | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 0763a255b3afa..8e09e6f8d4935 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1872,6 +1872,16 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { } bool LoopVectorizationLegality::canFoldTailByMasking() const { + // The only loops we can vectorize without a scalar epilogue, are loops with + // a bottom-test and a single exiting block. We'd have to handle the fact + // that not every instruction executes on the last iteration. This will + // require a lane mask which varies through the vector loop body. (TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking. Requires a singe latch exit\n"); + return false; + } LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1924,16 +1934,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { } } - // The only loops we can vectorize without a scalar epilogue, are loops with - // a bottom-test and a single exiting block. We'd have to handle the fact - // that not every instruction executes on the last iteration. This will - // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking. Requires a singe latch exit\n"); - return false; - } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); return true; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a010f5c52e9a7..01254b51a94e8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4016,14 +4016,17 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) { + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && + !Legal->hasUncountableEarlyExit()) + return false; unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; ScalarEvolution *SE = PSE.getSE(); - // Currently only loops with countable exits are vectorized, but calling - // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with - // uncountable exits whilst also ensuring the symbolic maximum and known - // back-edge taken count remain identical for loops with countable exits. + // Calling getSymbolicMaxBackedgeTakenCount enables support for loops + // with uncountable exits. For countable loops, the symbolic maximum must + // remain identical to the known back-edge taken count. const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount(); - assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() && + assert((Legal->hasUncountableEarlyExit() || + BackedgeTakenCount == PSE.getBackedgeTakenCount()) && "Invalid loop count"); const SCEV *ExitCount = SE->getAddExpr( BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); @@ -4033,9 +4036,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return Rem->isZero(); }; - bool HasSingleLatchExit = - TheLoop->getExitingBlock() == TheLoop->getLoopLatch(); - if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) { + if (MaxPowerOf2RuntimeVF > 0u) { assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) { @@ -4046,8 +4047,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop); - if (HasSingleLatchExit && ExpectedTC && - ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) { + if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) { if (MaxPowerOf2RuntimeVF > 0u) { // If we have a low-trip-count, and the fixed-width VF is known to divide // the trip count but the scalable factor does not, use the fixed-width From 15f48eeb89ad78bf791578df9db387ce37e21d13 Mon Sep 17 00:00:00 2001 From: ShihPo Hung Date: Mon, 7 Apr 2025 18:28:20 -0700 Subject: [PATCH 4/4] Clarify check in NoScalarEpilogueNeeded --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 01254b51a94e8..a0f239f00f106 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4016,6 +4016,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) { + // Return false if the loop is neither a single-latch-exit loop nor an + // early-exit loop as tail-folding is not supported in that case. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && !Legal->hasUncountableEarlyExit()) return false;