From 50be0596277ed0ff5fd4e38eec2dd75f8fd56a66 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 22 Apr 2025 21:17:27 +0100
Subject: [PATCH 1/5] [VPlan] Add convertToUniformRecipes transform.

Add a new convertToUniformRecipes transform, which uses VPlan-based
uniformity analysis to determine whether wide recipes and replicate
recipes can be converted to uniform recipes.

There are a few places where we convert recipes to uniform recipes
ad hoc; this transform will eventually replace them. A few more
generalizations are required to do so, which I plan to address in
follow-ups.

By converting the recipes to uniform recipes, we effectively
materialize the information from the VPlan-based analysis.

Note that there is currently one regression, in SystemZ/pr47665.ll,
due to trivial constant-folding opportunities in the input IR that the
new transform does not yet exploit. This will be fixed by VPlan-based
constant folding (https://github.com/llvm/llvm-project/pull/125365/).
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 35 +++++++++++++++++++
 .../LoopVectorize/SystemZ/pr47665.ll          | 33 ++++++++---------
 .../LoopVectorize/X86/cost-model.ll           |  5 +--
 .../version-stride-with-integer-casts.ll      | 11 +++---
 4 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 79ddb8bf0b09b..50552c843cd59 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1084,6 +1084,40 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
   }
 }
 
+static void convertToUniformRecipes(VPlan &Plan) {
+  auto TryToNarrow = [](VPBasicBlock *VPBB) {
+    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+      // Try to narrow wide and replicating recipes to uniform recipes, based on
+      // VPlan analysis.
+      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
+      if (!Def || !isa<VPWidenRecipe, VPReplicateRecipe>(Def) ||
+          !Def->getUnderlyingValue())
+        continue;
+
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (RepR && RepR->isUniform())
+        continue;
+
+      // Skip recipes that aren't uniform and don't have only their scalar
+      // results used. In the later case, we would introduce extra broadcasts.
+      if (!vputils::isUniformAfterVectorization(Def) ||
+          any_of(Def->users(),
+                 [Def](VPUser *U) { return !U->usesScalars(Def); }))
+        continue;
+
+      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
+                                          Def->operands(), /*IsUniform*/ true);
+      Clone->insertBefore(Def);
+      Def->replaceAllUsesWith(Clone);
+      Def->eraseFromParent();
+    }
+  };
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())))
+    TryToNarrow(VPBB);
+}
+
 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
 /// to make sure the masks are simplified.
 static void simplifyBlends(VPlan &Plan) {
@@ -1778,6 +1812,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
   runPass(simplifyBlends, Plan);
   runPass(removeDeadRecipes, Plan);
+  runPass(convertToUniformRecipes, Plan);
   runPass(legalizeAndOptimizeInductions, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 02a876a3fda67..bb96c166f894c 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -7,86 +7,87 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i1 true, false
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
 ; CHECK:       pred.store.if1:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
-; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    store i1 [[TMP0]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index f8b1cc2d775f5..7c42c3d9cd52e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -890,9 +890,7 @@ define i64 @cost_assume(ptr %end, i64 %N) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -904,7 +902,6 @@ define i64 @cost_assume(ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index fb84739881010..30e0acb4d7bf6 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -159,9 +159,6 @@ define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
 ; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[SCALE]], 1
 ; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
-; CHECK-NEXT:    [[TMP81:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
 ; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -174,10 +171,10 @@ define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP16]]
-; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP11]], align 8
-; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP15]], align 8
-; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP17]], align 8
+; CHECK-NEXT:    store ptr [[TMP83]], ptr [[TMP11]], align 8
+; CHECK-NEXT:    store ptr [[TMP83]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    store ptr [[TMP83]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store ptr [[TMP83]], ptr [[TMP17]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]

From a8dda3f680ae7148440747694ee189d2cf131aba Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 15 May 2025 21:18:46 +0100
Subject: [PATCH 2/5] !fixup address latest comments.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3803dd20d866b..d40f8d59d4f24 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1087,37 +1087,37 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
   }
 }
 
 static void convertToUniformRecipes(VPlan &Plan) {
-  auto TryToNarrow = [](VPBasicBlock *VPBB) {
+  if (Plan.hasScalarVFOnly())
+    return;
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
       // Try to narrow wide and replicating recipes to uniform recipes, based on
       // VPlan analysis.
-      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
-      if (!Def || !isa<VPWidenRecipe, VPReplicateRecipe>(Def) ||
-          !Def->getUnderlyingValue())
-        continue;
-
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!RepR && !isa<VPWidenRecipe>(&R))
+        continue;
       if (RepR && RepR->isUniform())
         continue;
 
+      auto *RepOrWiden = cast<VPSingleDefRecipe>(&R);
       // Skip recipes that aren't uniform and don't have only their scalar
       // results used. In the later case, we would introduce extra broadcasts.
-      if (!vputils::isUniformAfterVectorization(Def) ||
-          any_of(Def->users(),
-                 [Def](VPUser *U) { return !U->usesScalars(Def); }))
+      if (!vputils::isUniformAfterVectorization(RepOrWiden) ||
+          any_of(RepOrWiden->users(), [RepOrWiden](VPUser *U) {
+            return !U->usesScalars(RepOrWiden);
+          }))
         continue;
 
-      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
-                                          Def->operands(), /*IsUniform*/ true);
-      Clone->insertBefore(Def);
-      Def->replaceAllUsesWith(Clone);
-      Def->eraseFromParent();
+      auto *Clone =
+          new VPReplicateRecipe(RepOrWiden->getUnderlyingInstr(),
+                                RepOrWiden->operands(), /*IsUniform*/ true);
+      Clone->insertBefore(RepOrWiden);
+      RepOrWiden->replaceAllUsesWith(Clone);
+      RepOrWiden->eraseFromParent();
     }
-  };
-
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
-           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())))
-    TryToNarrow(VPBB);
+  }
 }
 
 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes

From a7e9545a415359c9f82f146eee1faa8ec9fa621a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 15 May 2025 21:37:35 +0100
Subject: [PATCH 3/5] !fixup naming, later->latter
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d40f8d59d4f24..9270f48718233 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1101,21 +1101,21 @@ static void convertToUniformRecipes(VPlan &Plan) {
       if (RepR && RepR->isUniform())
         continue;
 
-      auto *RepOrWiden = cast<VPSingleDefRecipe>(&R);
+      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
       // Skip recipes that aren't uniform and don't have only their scalar
-      // results used. In the later case, we would introduce extra broadcasts.
-      if (!vputils::isUniformAfterVectorization(RepOrWiden) ||
-          any_of(RepOrWiden->users(), [RepOrWiden](VPUser *U) {
-            return !U->usesScalars(RepOrWiden);
+      // results used. In the latter case, we would introduce extra broadcasts.
+      if (!vputils::isUniformAfterVectorization(RepOrWidenR) ||
+          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
+            return !U->usesScalars(RepOrWidenR);
           }))
         continue;
 
       auto *Clone =
-          new VPReplicateRecipe(RepOrWiden->getUnderlyingInstr(),
-                                RepOrWiden->operands(), /*IsUniform*/ true);
-      Clone->insertBefore(RepOrWiden);
-      RepOrWiden->replaceAllUsesWith(Clone);
-      RepOrWiden->eraseFromParent();
+          new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
+                                RepOrWidenR->operands(), /*IsUniform*/ true);
+      Clone->insertBefore(RepOrWidenR);
+      RepOrWidenR->replaceAllUsesWith(Clone);
+      RepOrWidenR->eraseFromParent();
     }
   }
 }

From 451d82a044d47e7eb24754300103f304cdc379bb Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 16 May 2025 21:05:21 +0100
Subject: [PATCH 4/5] !fixup address comments, thanks
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b7555ff1e3e44..eff5724667227 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1085,36 +1085,36 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
   }
 }
 
-static void convertToUniformRecipes(VPlan &Plan) {
+static void narrowToSingleScalarRecipes(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
 
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      // Try to narrow wide and replicating recipes to uniform recipes, based on
-      // VPlan analysis.
+      // Try to narrow wide and replicating recipes to single scalar recipes,
+      // based on VPlan analysis.
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (!RepR && !isa<VPWidenRecipe>(&R))
         continue;
-      if (RepR && RepR->isUniform())
+      if (RepR && RepR->isSingleScalar())
         continue;
 
       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
-      // Skip recipes that aren't uniform and don't have only their
-      // scalar results used. In the latter case, we would introduce extra
-      // broadcasts.
-      if (!vputils::isUniformAfterVectorization(RepOrWidenR) ||
+      // Skip recipes that aren't single scalars and don't have only their
+      // scalar results used. In the latter case, we would introduce extra
+      // broadcasts.
+      if (!vputils::isSingleScalar(RepOrWidenR) ||
          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
            return !U->usesScalars(RepOrWidenR);
          }))
        continue;
 
-      auto *Clone =
-          new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
-                                RepOrWidenR->operands(), /*IsUniform*/ true);
+      auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
+                                          RepOrWidenR->operands(),
+                                          true /*IsSingleScalar*/);
       Clone->insertBefore(RepOrWidenR);
       RepOrWidenR->replaceAllUsesWith(Clone);
-      RepOrWidenR->eraseFromParent();
     }
   }
 }
@@ -1813,7 +1813,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
   runPass(simplifyBlends, Plan);
   runPass(removeDeadRecipes, Plan);
-  runPass(convertToUniformRecipes, Plan);
+  runPass(narrowToSingleScalarRecipes, Plan);
   runPass(legalizeAndOptimizeInductions, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());

From 5b15a9be581fb9da82df18ad5c8f2cae7af69f3e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 17 May 2025 22:53:11 +0100
Subject: [PATCH 5/5] !fixup update comment, add comment re regions.
---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index eff5724667227..8c8297bb1ae94 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1089,11 +1089,13 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
 
+  // Try to narrow wide and replicating recipes to single scalar recipes,
+  // based on VPlan analysis. Only process blocks in the loop region for now,
+  // without traversing into nested regions, as recipes in replicate regions
+  // cannot be converted yet.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      // Try to narrow wide and replicating recipes to single scalar recipes,
-      // based on VPlan analysis.
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (!RepR && !isa<VPWidenRecipe>(&R))
         continue;
@@ -1101,7 +1103,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;
 
       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
-      // Skip recipes that aren't single scalars and don't have only their
+      // Skip recipes that aren't single scalars or don't have only their
      // scalar results used. In the latter case, we would introduce extra
      // broadcasts.
      if (!vputils::isSingleScalar(RepOrWidenR) ||
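
The net effect of the series is easiest to see in the cost-model.ll change from
patch 1. The following standalone IR sketch distills it; the @before/@after
function names are illustrative only and are not part of the patch. When every
lane of a wide recipe computes the same value and only its scalar result is
used, the narrowed plan emits one scalar instruction instead of a broadcast, a
vector operation, and an extract of lane 0:

; Before narrowing: %n is broadcast, compared as a vector, and only
; lane 0 of the result is used.
define void @before(i64 %n) {
entry:
  %bc.ins = insertelement <2 x i64> poison, i64 %n, i64 0
  %bc = shufflevector <2 x i64> %bc.ins, <2 x i64> poison, <2 x i32> zeroinitializer
  %wide.cmp = icmp ne <2 x i64> %bc, zeroinitializer
  %lane0 = extractelement <2 x i1> %wide.cmp, i32 0
  tail call void @llvm.assume(i1 %lane0)
  ret void
}

; After narrowing: the recipe becomes a single scalar, so the broadcast
; and extractelement disappear.
define void @after(i64 %n) {
entry:
  %cmp = icmp ne i64 %n, 0
  tail call void @llvm.assume(i1 %cmp)
  ret void
}

declare void @llvm.assume(i1)

This is also why the repeated assume calls in @cost_assume can all reuse the
single scalar [[TMP11]] after the change.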