From ca30d43b1be110a9f16f1c48f9d885964937bfac Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 2 Apr 2025 13:17:14 +0100 Subject: [PATCH 1/4] [VPlan] Split out VPBlendRecipe simplifications from simplifyRecipes. NFC This is split off from #133977 VPBlendRecipe normalisation is sensitive to the number of users a mask has, so should probably be run after the masks are simplified as much as possible. Note this could be run after removeDeadRecipes but this causes test diffs, some regressions, so this is left to a later patch. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 148 ++++++++++-------- .../Transforms/Vectorize/VPlanTransforms.h | 4 + 2 files changed, 84 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0c37db7f9d3a3..137ed330c31b2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -926,74 +926,6 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { using namespace llvm::VPlanPatternMatch; - if (auto *Blend = dyn_cast(&R)) { - // Try to remove redundant blend recipes. - SmallPtrSet UniqueValues; - if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) - UniqueValues.insert(Blend->getIncomingValue(0)); - for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) - if (!match(Blend->getMask(I), m_False())) - UniqueValues.insert(Blend->getIncomingValue(I)); - - if (UniqueValues.size() == 1) { - Blend->replaceAllUsesWith(*UniqueValues.begin()); - Blend->eraseFromParent(); - return; - } - - if (Blend->isNormalized()) - return; - - // Normalize the blend so its first incoming value is used as the initial - // value with the others blended into it. - - unsigned StartIndex = 0; - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - // If a value's mask is used only by the blend then is can be deadcoded. - // TODO: Find the most expensive mask that can be deadcoded, or a mask - // that's used by multiple blends where it can be removed from them all. - VPValue *Mask = Blend->getMask(I); - if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { - StartIndex = I; - break; - } - } - - SmallVector OperandsWithMask; - OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); - - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - if (I == StartIndex) - continue; - OperandsWithMask.push_back(Blend->getIncomingValue(I)); - OperandsWithMask.push_back(Blend->getMask(I)); - } - - auto *NewBlend = new VPBlendRecipe( - cast(Blend->getUnderlyingValue()), OperandsWithMask); - NewBlend->insertBefore(&R); - - VPValue *DeadMask = Blend->getMask(StartIndex); - Blend->replaceAllUsesWith(NewBlend); - Blend->eraseFromParent(); - recursivelyDeleteDeadRecipes(DeadMask); - - /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. - VPValue *NewMask; - if (NewBlend->getNumOperands() == 3 && - match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) { - VPValue *Inc0 = NewBlend->getOperand(0); - VPValue *Inc1 = NewBlend->getOperand(1); - VPValue *OldMask = NewBlend->getOperand(2); - NewBlend->setOperand(0, Inc1); - NewBlend->setOperand(1, Inc0); - NewBlend->setOperand(2, NewMask); - if (OldMask->getNumUsers() == 0) - cast(OldMask)->eraseFromParent(); - } - return; - } - // VPScalarIVSteps can only be simplified after unrolling. VPScalarIVSteps for // part 0 can be replaced by their start value, if only the first lane is // demanded. @@ -1092,6 +1024,85 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { } } +void VPlanTransforms::simplifyBlends(VPlan &Plan) { + using namespace llvm::VPlanPatternMatch; + ReversePostOrderTraversal> RPOT( + Plan.getEntry()); + SetVector Worklist; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *Blend = dyn_cast(&R); + if (!Blend) + continue; + + // Try to remove redundant blend recipes. + SmallPtrSet UniqueValues; + if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) + UniqueValues.insert(Blend->getIncomingValue(0)); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) + if (!match(Blend->getMask(I), m_False())) + UniqueValues.insert(Blend->getIncomingValue(I)); + + if (UniqueValues.size() == 1) { + Blend->replaceAllUsesWith(*UniqueValues.begin()); + Blend->eraseFromParent(); + continue; + } + + if (Blend->isNormalized()) + continue; + + // Normalize the blend so its first incoming value is used as the initial + // value with the others blended into it. + + unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. + VPValue *Mask = Blend->getMask(I); + if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { + StartIndex = I; + break; + } + } + + SmallVector OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + if (I == StartIndex) + continue; + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); + Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); + + /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. + VPValue *NewMask; + if (NewBlend->getNumOperands() == 3 && + match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) { + VPValue *Inc0 = NewBlend->getOperand(0); + VPValue *Inc1 = NewBlend->getOperand(1); + VPValue *OldMask = NewBlend->getOperand(2); + NewBlend->setOperand(0, Inc1); + NewBlend->setOperand(1, Inc0); + NewBlend->setOperand(2, NewMask); + if (OldMask->getNumUsers() == 0) + cast(OldMask)->eraseFromParent(); + } + } + } +} + /// Optimize the width of vector induction variables in \p Plan based on a known /// constant Trip Count, \p BestVF and \p BestUF. static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, @@ -1687,6 +1698,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantInductionCasts, Plan); runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); + runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); runPass(legalizeAndOptimizeInductions, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index c23ff38265670..b008459ebad3f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -183,6 +183,10 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes + /// to make sure the masks are simplified. + static void simplifyBlends(VPlan &Plan); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. From a4c2c3db71a88c7cfd65fe2dce61a06495ed31cf Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 2 Apr 2025 13:56:57 +0100 Subject: [PATCH 2/4] Remove dead worklist --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 137ed330c31b2..85a133f569537 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1028,7 +1028,6 @@ void VPlanTransforms::simplifyBlends(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; ReversePostOrderTraversal> RPOT( Plan.getEntry()); - SetVector Worklist; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *Blend = dyn_cast(&R); From ab2e0d93bafd6d28d1fc2067a64b067aadfb30c6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 2 Apr 2025 14:25:23 +0100 Subject: [PATCH 3/4] Make static --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +++- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 4 ---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 85a133f569537..bc8eab74fab40 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1024,7 +1024,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { } } -void VPlanTransforms::simplifyBlends(VPlan &Plan) { +/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes +/// to make sure the masks are simplified. +static void simplifyBlends(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; ReversePostOrderTraversal> RPOT( Plan.getEntry()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b008459ebad3f..c23ff38265670 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -183,10 +183,6 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); - /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes - /// to make sure the masks are simplified. - static void simplifyBlends(VPlan &Plan); - /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. From 4776c9c350e950ed1e276b5b2621343641058d4b Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 2 Apr 2025 21:53:48 +0100 Subject: [PATCH 4/4] Only shallow traverse vector loop region --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bc8eab74fab40..7372dedf921f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1028,9 +1028,8 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { /// to make sure the masks are simplified. static void simplifyBlends(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; - ReversePostOrderTraversal> RPOT( - Plan.getEntry()); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *Blend = dyn_cast(&R); if (!Blend)