From 9b7097cedf678a87d00e3249e95ded473a89d390 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 13 Nov 2024 00:39:18 +0800 Subject: [PATCH 1/4] Precommit test --- ...rize-force-tail-with-evl-reduction-cost.ll | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll new file mode 100644 index 0000000000000..db4b4a4e5c5ed --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll @@ -0,0 +1,34 @@ +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s \ +; RUN: --check-prefix=EVL + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s \ +; RUN: --check-prefix=NO-EVL + +; EVL: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC vp<%{{.+}}> = call llvm.vp.merge(ir<true>, ir<%add>, ir<%rdx>, vp<%{{.+}}>) +; EVL: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + +; NO-EVL: Cost of 0 for VF vscale x 4: EMIT vp<%{{.+}}> = select vp<%active.lane.mask>, ir<%add>, ir<%rdx> +; NO-EVL: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + +define i32 @add(ptr %a, i64 %n, i32 %start) { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, 
ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i32 %add +} From 577290a0b4cbace14fa2aff0dae28a78b9835eae Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 13 Nov 2024 00:52:38 +0800 Subject: [PATCH 2/4] [LV] Account for vp_merge in out of loop EVL reductions in legacy cost model In #101641, support for out of loop reductions with EVL tail folding was added by transforming selects to vp_merges in transformRecipestoEVLRecipes. Whilst the select was previously free, the vp_merge wasn't and incurs a cost on RISC-V with the VPlan cost model. But this diverged from the legacy cost model and caused the "VPlan cost model and legacy cost model disagreed" assertion to trigger when building 502.gcc_r from SPEC CPU 2017. Neither the select nor vp_merge recipes from the VPlan exist in the underlying instructions, so I thought it would make the most sense to fix this by adding the cost to the underlying phi instruction in getInstructionCost. It's worth noting that on RISC-V this vp_merge won't actually generate any instructions because the mask is all true, and will be folded away. So we should update the cost model at some point to reflect that. 
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++++++++ .../vectorize-force-tail-with-evl-reduction-cost.ll | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1ebc62f984390..5c3afbe5214fe 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6567,6 +6567,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, CmpInst::BAD_ICMP_PREDICATE, CostKind); } + // When tail folding with EVL, if the phi is part of an out of loop reduction + // then it will be transformed into a wide vp_merge. + if (VF.isVector() && foldTailWithEVL() && + Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { + IntrinsicCostAttributes ICA( + Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF), + {ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); + return TTI.getIntrinsicInstrCost(ICA, CostKind); + } + return TTI.getCFInstrCost(Instruction::PHI, CostKind); } case Instruction::UDiv: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll index db4b4a4e5c5ed..6d20731d2502b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll @@ -10,7 +10,7 @@ ; RUN: --check-prefix=NO-EVL ; EVL: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC vp<%{{.+}}> = call llvm.vp.merge(ir<true>, ir<%add>, ir<%rdx>, vp<%{{.+}}>) -; EVL: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +; EVL: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] ; NO-EVL: Cost of 0 for VF vscale x 4: EMIT 
vp<%{{.+}}> = select vp<%active.lane.mask>, ir<%add>, ir<%rdx> ; NO-EVL: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] From 6b6819dcd60a5cd9f9d5978b7f421ccfd6a896a7 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 14 Nov 2024 16:10:23 +0900 Subject: [PATCH 3/4] Fix clang-format --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5c3afbe5214fe..d029997e9565f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6567,8 +6567,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, CmpInst::BAD_ICMP_PREDICATE, CostKind); } - // When tail folding with EVL, if the phi is part of an out of loop reduction - // then it will be transformed into a wide vp_merge. + // When tail folding with EVL, if the phi is part of an out of loop + // reduction then it will be transformed into a wide vp_merge. 
if (VF.isVector() && foldTailWithEVL() && Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { IntrinsicCostAttributes ICA( From c1cf0c2a4cd6d69cc15f7a8caec4b01aa5bf9997 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 14 Nov 2024 16:18:12 +0900 Subject: [PATCH 4/4] Address review comments --- ...rize-force-tail-with-evl-reduction-cost.ll | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll index 6d20731d2502b..aa1bb25af930d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll @@ -1,34 +1,25 @@ ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ ; RUN: -force-tail-folding-style=data-with-evl \ ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ -; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s \ -; RUN: --check-prefix=EVL +; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ -; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ -; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s \ -; RUN: --check-prefix=NO-EVL - -; EVL: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC vp<%{{.+}}> = call llvm.vp.merge(ir<true>, ir<%add>, ir<%rdx>, vp<%{{.+}}>) -; EVL: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] - -; NO-EVL: Cost of 0 for VF vscale x 4: EMIT vp<%{{.+}}> = select vp<%active.lane.mask>, ir<%add>, ir<%rdx> -; NO-EVL: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +; CHECK: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC 
vp<%{{.+}}> = call llvm.vp.merge(ir<true>, ir<%add>, ir<%rdx>, vp<%{{.+}}>) +; CHECK: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %rdx = phi i32 [ %start, %entry ], [ %add, %loop ] define i32 @add(ptr %a, i64 %n, i32 %start) { entry: - br label %for.body + br label %loop -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %add, %loop ] %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv %0 = load i32, ptr %arrayidx, align 4 %add = add nsw i32 %0, %rdx %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, %n - br i1 %exitcond.not, label %for.end, label %for.body + br i1 %exitcond.not, label %exit, label %loop -for.end: +exit: ret i32 %add }