[VPlan] Don't convert VPWidenSelectRecipes to vp.select in EVL transform #146695

Conversation
createEVLRecipe tries to optimise recipes that use the header mask by replacing them with their VP equivalents and setting the EVL, allowing the mask to be removed. However, we currently also convert widened selects to vp.select even though they don't necessarily use the header mask. Unlike vp.merge, a vp.select only makes the "unused" lanes past EVL poison, so it's not needed for correctness.

In the same vein as llvm#127180, this patch removes the transform for VPWidenSelectRecipes and keeps them as plain select instructions to allow for more optimisations. RISCVVLOptimizer will still be able to optimise away any VL toggles, and we end up with better code generation across llvm-test-suite and SPEC CPU 2017:

```diff
--- build.rva23u64-evl-O3.a/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/mesh.s
+++ build.rva23u64-evl-O3.b/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/mesh.s
 vmsne.vi v9, v10, 0
- vand.vi v10, v16, 1
- vmsne.vi v16, v10, 0
- vmor.mm v9, v9, v16
- vmandn.mm v0, v18, v9
+ vmandn.mm v0, v16, v9
```

```diff
--- build.rva23u64-evl-O3.a/External/SPEC/CFP2017rate/510.parest_r/CMakeFiles/510.parest_r.dir/Users/luke/Developer/cpu2017/benchspec/CPU/510.parest_r/src/source/grid/tria.s
+++ build.rva23u64-evl-O3.b/External/SPEC/CFP2017rate/510.parest_r/CMakeFiles/510.parest_r.dir/Users/luke/Developer/cpu2017/benchspec/CPU/510.parest_r/src/source/grid/tria.s
@@ -19077,8 +19077,6 @@
 vmv.v.x v8, a2
 vsetvli zero, zero, e32, m1, ta, ma
 vid.v v11
- vsetvli zero, zero, e64, m2, ta, ma
- vmv.v.i v18, 0
 li t0, 8
 ld a4, 24(sp) # 8-byte Folded Reload
 .LBB37_392: # %vector.body
@@ -19112,9 +19110,8 @@
 vsetvli zero, zero, e32, m1, ta, ma
 vsra.vi v16, v16, 6
 vwmaccus.vx v14, t0, v16
- vsetvli zero, zero, e64, m2, ta, ma
- vmerge.vim v16, v18, -8, v0
- vadd.vv v14, v14, v16
+ vsetvli zero, zero, e64, m2, ta, mu
+ vadd.vi v14, v14, -8, v0.t
 vluxei64.v v14, (zero), v14
 addw a5, a5, s0
 vand.vx v12, v12, s9
```

```diff
--- build.rva23u64-evl-O3.a/MultiSource/Applications/sqlite3/CMakeFiles/sqlite3.dir/sqlite3.s
+++ build.rva23u64-evl-O3.b/MultiSource/Applications/sqlite3/CMakeFiles/sqlite3.dir/sqlite3.s
 # Parent Loop BB494_57 Depth=1
@@ -115460,16 +115443,13 @@
 sub a3, a0, a1
 sh2add a4, a1, a2
 vsetvli a5, a3, e32, m1, ta, ma
- vle32.v v14, (a4)
- vmv2r.v v10, v16
- vmsltu.vx v0, v14, s1
- vsetvli a4, zero, e32, m1, ta, ma
- vwsll.vv v12, v15, v14
- vsetvli zero, a3, e64, m2, ta, mu
- vnot.v v10, v12, v0.t
+ vle32.v v12, (a4)
 add a1, a1, a5
- vsetvli zero, zero, e64, m2, tu, ma
- vand.vv v8, v10, v8
+ vmsltu.vx v0, v12, s1
+ vsetvli a4, zero, e32, m1, ta, ma
+ vwsll.vv v10, v13, v12
+ vsetvli zero, a3, e64, m2, tu, mu
+ vandn.vv v8, v8, v10, v0.t
```
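To spell out the semantic difference, here is a minimal IR sketch; `%mask`, `%x`, `%y` and `%evl` are placeholder names and not taken from the patch:

```llvm
; vp.merge: lanes at or past %evl take the false operand %y, so it is needed
; when those lanes must be preserved (e.g. a reduction phi).
%m = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y, i32 %evl)

; vp.select: lanes at or past %evl are poison, so on the lanes the loop
; actually uses it behaves exactly like a plain select.
%s = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y, i32 %evl)
%t = select <vscale x 4 x i1> %mask, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y
```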
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Luke Lau (lukel97)

Patch is 29.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146695.diff

8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d4a73c744469..ead7ae5572b8f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2152,13 +2152,6 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
})
- .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
- SmallVector<VPValue *> Ops(Sel->operands());
- Ops.push_back(&EVL);
- return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
- TypeInfo.inferScalarType(Sel),
- Sel->getDebugLoc());
- })
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
assert(PrevEVL && "Fixed-order recurrences require previous EVL");
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index c063e6ff3f482..d485a7432423a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -367,7 +367,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 6cfb4bcdd991a..45357dd6bf0d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -138,20 +138,19 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]
-; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.vp.select.nxv16i8(<vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> align 1 [[TMP19]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
; PREDICATED_EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_EVL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_EVL: middle.block:
; PREDICATED_EVL-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_EVL: scalar.ph:
@@ -364,29 +363,27 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = icmp slt <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]
-; PREDICATED_EVL-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.vp.select.nxv16i8(<vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP21:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP20]]
-; PREDICATED_EVL-NEXT: [[TMP22:%.*]] = icmp slt <vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]
-; PREDICATED_EVL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.vp.select.nxv16i8(<vscale x 16 x i1> [[TMP22]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP24:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP23]]
-; PREDICATED_EVL-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_EVL-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
+; PREDICATED_EVL-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
+; PREDICATED_EVL-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
+; PREDICATED_EVL-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
+; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_EVL-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
; PREDICATED_EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_EVL-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
; PREDICATED_EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_EVL-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP23]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP31:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP31]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP24]], <vscale x 16 x ptr> align 1 [[TMP32]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
; PREDICATED_EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_EVL-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_EVL-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; PREDICATED_EVL: middle.block:
; PREDICATED_EVL-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
index 93bd44f5c6220..e4892db592d6b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
@@ -9,7 +9,7 @@
define void @vp_select(ptr %a, ptr %b, ptr %c, i64 %N) {
; DEBUGLOC-LABEL: define void @vp_select(
; DEBUGLOC: vector.body:
-; DEBUGLOC: = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %{{.+}}, <vscale x 4 x i32> %{{.+}}, <vscale x 4 x i32> %{{.+}}, i32 %{{.+}}), !dbg ![[SELLOC:[0-9]+]]
+; DEBUGLOC: = select <vscale x 4 x i1> %{{.+}}, <vscale x 4 x i32> %{{.+}}, <vscale x 4 x i32> %{{.+}}, !dbg ![[SELLOC:[0-9]+]]
; DEBUGLOC: loop:
; DEBUGLOC: = select i1 %{{.+}}, i32 %{{.+}}, i32 %{{.+}}, !dbg ![[SELLOC]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index f059730a245be..1e1ed49f9f2ed 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -44,7 +44,7 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
; CHECK-NEXT: [[TMP23:%.*]] = ashr <vscale x 8 x i32> [[TMP15]], zeroinitializer
; CHECK-NEXT: [[VP_OP3:%.*]] = or <vscale x 8 x i32> [[TMP23]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 8 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vp.select.nxv8i32(<vscale x 8 x i1> [[TMP16]], <vscale x 8 x i32> [[VP_OP3]], <vscale x 8 x i32> zeroinitializer, i32 [[TMP11]])
+; CHECK-NEXT: [[TMP17:%.*]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i32> [[VP_OP3]], <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 8 x i32> [[TMP17]] to <vscale x 8 x i8>
; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP24]], <vscale x 8 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[TMP19:%.*]] = trunc <vscale x 8 x i32> [[VP_OP]] to <vscale x 8 x i16>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 8c44da63e08a6..2926011857ae9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -47,7 +47,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP11]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[TMP19]], [[VEC_PHI]]
; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP11]])
; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64
@@ -101,7 +101,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
@@ -543,7 +543,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP12]])
; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
@@ -606,7 +606,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND]]
-; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP11]])
+; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-INLOOP-NEXT: [[ADD]] = add i32 [[TMP17]], [[RDX]]
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = zext i32 [[TMP11]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 83deebe42d38f..8b1441450dd94 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -602,7 +602,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
@@ -723,7 +723,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inb...
[truncated]
; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
An example of an optimisation InstCombine can do with a regular select. As a side note, should these tests be running instcombine to begin with?
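For reference, the fold in question is roughly the following; `%a` and `%b` are placeholder operands:

```llvm
; Before: a signed max written as icmp + select
%c = icmp slt <vscale x 16 x i8> %a, %b
%r = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %b, <vscale x 16 x i8> %a

; After InstCombine: canonicalised to the smax intrinsic
%r.max = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
```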
This bit doesn't seem to match the code. We only iterate through the users of the headermask calling createEVLRecipe. You are right that we don't seem to rewrite the mask operand in the expected way though. This patch otherwise makes sense, just making sure you don't have a bad assumption on your way to this outcome.
This bit tripped me up when I was looking at #146672, but we iterate recursively through the users IIUC. So a select may not use the header mask directly, but it may use another recipe that does.
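For illustration, a hand-written IR sketch (not taken from the tests; `%hm` stands for the header mask):

```llvm
; The load is masked by the header mask %hm; the select never uses %hm
; directly, but it is a transitive user of it through %x.
%x = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %p, i32 4, <vscale x 4 x i1> %hm, <vscale x 4 x i32> poison)
%c = icmp sgt <vscale x 4 x i32> %x, %y
%s = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y
```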
Hm, does that mean we're implicitly assuming all transitive users have poison for the lanes masked off for the header mask? I can't think of a counter example off the top of my head, but that seems like a rather strong assumption.
LGTM.
This seems to produce better code in practice, but it feels like we're missing an opportunity here. If we have a select whose mask is the headermask, that should be a vmv.v.v at worst and possibly elided entirely. If we have a select whose mask includes the headermask, we should be able to simplify the mask. Can we get the best of both worlds here somehow?
Agreed, so I'd prefer it if we kept the transforms in
Yeah we currently do this for Select VPInstructions where the mask is the header mask, and I'm hoping that with #133993 we will be able to simplify more of them. But this patch deals with VPWidenSelectRecipes which are separate, i.e. selects that come directly from the scalar code. And from what I understand we don't predicate these with the header mask to begin with, so I don't think we're "missing" out on any simplifications by removing the transform.
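As a rough sketch of the Select VPInstruction case (placeholder names; the resulting pattern is the one visible in the reduction tests above):

```llvm
; A select whose condition is the header mask %hm ...
%r = select <vscale x 4 x i1> %hm, <vscale x 4 x i32> %x, <vscale x 4 x i32> %phi

; ... is what the EVL transform turns into a vp.merge with an all-true mask,
; so lanes at or past %evl keep the value of %phi.
%r.evl = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %x, <vscale x 4 x i32> %phi, i32 %evl)
```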
LGTM, thanks. This is in line with moving towards only generating vp intrinsics when necessary.