From 0bb35d87f2e076de5c498b5667e4db75c8e47d2c Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Mon, 26 Feb 2024 14:42:00 +0800
Subject: [PATCH 1/5] [RISCV] Handle fixed length vectors with exact VLEN in
 lowerINSERT_SUBVECTOR

This is the insert_subvector equivalent to #79949, where we can avoid
sliding up by the full LMUL amount if we know the exact subregister the
subvector will be inserted into.

This mirrors the lowerEXTRACT_SUBVECTOR changes in that we handle this
in two parts:
- We handle fixed length subvector types by converting the subvector to
  a scalable vector. But unlike EXTRACT_SUBVECTOR, we may also need to
  convert the vector being inserted into.
- Whenever we don't need a vslideup, because either the subvector aligns
  to a vector register group *or* the vector is undef, we need to emit
  an insert_subreg ourselves, because RISCVISelDAGToDAG::Select doesn't
  correctly handle fixed length subvectors yet: see d7a28f7ad.

I've left RISCVISelDAGToDAG::Select untouched for now (minus relaxing an
invariant), so that the insert_subvector and extract_subvector code
paths are the same. We should teach it to properly handle fixed length
subvectors in a follow-up patch, so that the "exact subregister" logic
is handled in one place instead of being spread across both
RISCVISelDAGToDAG.cpp and RISCVISelLowering.cpp.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |   8 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 130 ++++++++---
 .../rvv/fixed-vectors-insert-subvector.ll     | 163 +++++++++-----
 .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 205 ++++++++++--------
 4 files changed, 334 insertions(+), 172 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index da1543bd7112a..cc8d4bb0c7dab 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2099,8 +2099,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
-      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+      bool AlignedToVecReg = false;
+      if (auto VLen = Subtarget->getRealVLen();
+          VLen && SubVecVT.getSizeInBits() ==
+                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
+                          (*VLen / RISCV::RVVBitsPerBlock))
+        AlignedToVecReg = true;
+      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index fe8edcf39681d..9bb984632124e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9723,6 +9723,21 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
+/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
+/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
+/// scalable quantities.
+static bool isKnownEQ(ElementCount LHS, ElementCount RHS, + const RISCVSubtarget &Subtarget) { + if (auto VLen = Subtarget.getRealVLen()) { + const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; + if (LHS.isScalable()) + LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale); + if (RHS.isScalable()) + RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale); + } + return LHS == RHS; +} + SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); @@ -9772,12 +9787,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, } } - // If the subvector vector is a fixed-length type, we cannot use subregister - // manipulation to simplify the codegen; we don't know which register of a - // LMUL group contains the specific subvector as we only know the minimum - // register size. Therefore we must slide the vector group up the full - // amount. - if (SubVecVT.isFixedLengthVector()) { + // If the subvector vector is a fixed-length type and we don't know VLEN + // exactly, we cannot use subregister manipulation to simplify the codegen; we + // don't know which register of a LMUL group contains the specific subvector + // as we only know the minimum register size. Therefore we must slide the + // vector group up the full amount. + const auto VLen = Subtarget.getRealVLen(); + if (SubVecVT.isFixedLengthVector() && !VLen) { if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector()) return Op; MVT ContainerVT = VecVT; @@ -9825,41 +9841,92 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, return DAG.getBitcast(Op.getValueType(), SubVec); } - unsigned SubRegIdx, RemIdx; - std::tie(SubRegIdx, RemIdx) = - RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( - VecVT, SubVecVT, OrigIdx, TRI); + MVT ContainerVecVT = VecVT; + if (VecVT.isFixedLengthVector()) { + ContainerVecVT = getContainerForFixedLengthVector(VecVT); + Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget); + } - RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT); + MVT ContainerSubVecVT = SubVecVT; + if (SubVecVT.isFixedLengthVector()) { + ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT); + SubVec = convertToScalableVector(ContainerSubVecVT, SubVec, DAG, Subtarget); + } + + unsigned SubRegIdx; + ElementCount RemIdx; + // insert_subvector scales the index by vscale if the subvector is scalable, + // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if + // we have a fixed length subvector, we need to adjust the index by 1/vscale. 
+ if (SubVecVT.isFixedLengthVector()) { + assert(VLen); + unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; + auto Decompose = + RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( + ContainerVecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI); + SubRegIdx = Decompose.first; + RemIdx = ElementCount::getFixed((Decompose.second * Vscale) + + (OrigIdx % Vscale)); + } else { + auto Decompose = + RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( + ContainerVecVT, ContainerSubVecVT, OrigIdx, TRI); + SubRegIdx = Decompose.first; + RemIdx = ElementCount::getScalable(Decompose.second); + } + + RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT); bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 || SubVecLMUL == RISCVII::VLMUL::LMUL_F4 || SubVecLMUL == RISCVII::VLMUL::LMUL_F8; + bool AlignedToVecReg = !IsSubVecPartReg; + if (SubVecVT.isFixedLengthVector()) + AlignedToVecReg &= SubVecVT.getSizeInBits() == + ContainerSubVecVT.getSizeInBits().getKnownMinValue() * + (*VLen / RISCV::RVVBitsPerBlock); // 1. If the Idx has been completely eliminated and this subvector's size is // a vector register or a multiple thereof, or the surrounding elements are // undef, then this is a subvector insert which naturally aligns to a vector // register. These can easily be handled using subregister manipulation. - // 2. If the subvector is smaller than a vector register, then the insertion - // must preserve the undisturbed elements of the register. We do this by - // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type - // (which resolves to a subregister copy), performing a VSLIDEUP to place the - // subvector within the vector register, and an INSERT_SUBVECTOR of that + // 2. If the subvector isn't exactly aligned to a vector register group, then + // the insertion must preserve the undisturbed elements of the register. We do + // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector + // type (which resolves to a subregister copy), performing a VSLIDEUP to place + // the subvector within the vector register, and an INSERT_SUBVECTOR of that // LMUL=1 type back into the larger vector (resolving to another subregister // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type // to avoid allocating a large register group to hold our subvector. - if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef())) + if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) { + if (SubVecVT.isFixedLengthVector()) { + // We may get NoSubRegister if inserting at index 0 and the subvec + // container is the same as the vector, e.g. 
vec=v4i32,subvec=v4i32,idx=0 + if (SubRegIdx == RISCV::NoSubRegister) { + assert(OrigIdx == 0); + return Op; + } + + SDValue Insert = + DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec); + if (VecVT.isFixedLengthVector()) + Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget); + return Insert; + } return Op; + } // VSLIDEUP works by leaving elements 0 @insert_nxv8i32_v2i32_0( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, ma -; CHECK-NEXT: vmv.v.v v8, v12 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; VLA-NEXT: vmv.v.v v8, v12 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; VLS-NEXT: vmv.v.v v8, v12 +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 0) ret %v } define @insert_nxv8i32_v2i32_2( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v2i32_2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 2 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v2i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; VLA-NEXT: vslideup.vi v8, v12, 2 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v2i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v8, v12, 2 +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 2) ret %v } define @insert_nxv8i32_v2i32_6( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v2i32_6: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 6 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v2i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; VLA-NEXT: vslideup.vi v8, v12, 6 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v2i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v9, v12, 2 +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 6) ret %v @@ -58,9 +82,7 @@ define @insert_nxv8i32_v8i32_0( %vec, ptr % ; ; VLS-LABEL: insert_nxv8i32_v8i32_0: ; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v12, (a0) -; VLS-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; VLS-NEXT: vmv.v.v v8, v12 +; VLS-NEXT: vl2re32.v v8, (a0) ; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 0) @@ -78,9 +100,7 @@ define @insert_nxv8i32_v8i32_8( %vec, ptr % ; ; VLS-LABEL: insert_nxv8i32_v8i32_8: ; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v12, (a0) -; VLS-NEXT: vsetivli 
zero, 16, e32, m4, tu, ma -; VLS-NEXT: vslideup.vi v8, v12, 8 +; VLS-NEXT: vl2re32.v v10, (a0) ; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 8) @@ -98,6 +118,31 @@ define @insert_nxv8i32_undef_v2i32_0(ptr %svp) { ret %v } +define @insert_nxv8i32_v4i32_0( %vec, <4 x i32> %subvec) { +; VLA-LABEL: insert_nxv8i32_v4i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; VLA-NEXT: vmv.v.v v8, v9 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v4i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v8, v9 +; VLS-NEXT: ret + %v = call @llvm.vector.insert.nxv2i32.v4i32( %vec, <4 x i32> %subvec, i64 0) + ret %v +} + + +define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: insert_v4i32_v4i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0) + ret <4 x i32> %v +} + define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v4i32_v2i32_0: ; VLA: # %bb.0: @@ -175,6 +220,31 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) { ret void } +; This tests the code path in RISCVISelDAGToDAG::Select where we select an +; insert_subvector with a fixed vector and fixed subvector type. The phi here is +; used to prevent the fixed insert_subvector from being combined away into a +; scalable insert_subvector. +define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) { +; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: bnez a0, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: .LBB11_2: # %bar +; CHECK-NEXT: ret +entry: + br i1 %cond, label %foo, label %bar +foo: + %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0) + br label %bar +bar: + %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry] + ret <4 x i32> %w +} + + define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v8i32_v2i32_0: ; VLA: # %bb.0: @@ -193,7 +263,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLS-NEXT: vle32.v v8, (a1) ; VLS-NEXT: vl2re32.v v10, (a0) -; VLS-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; VLS-NEXT: vmv.v.v v10, v8 ; VLS-NEXT: vs2r.v v10, (a0) ; VLS-NEXT: ret @@ -220,11 +290,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { ; VLS-LABEL: insert_v8i32_v2i32_2: ; VLS: # %bb.0: ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vle32.v v10, (a1) -; VLS-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; VLS-NEXT: vslideup.vi v8, v10, 2 -; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl2re32.v v10, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v10, v8, 2 +; VLS-NEXT: vs2r.v v10, (a0) ; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp @@ -247,11 +317,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { ; VLS-LABEL: insert_v8i32_v2i32_6: ; VLS: # %bb.0: ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vle32.v v10, (a1) -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v8, v10, 6 -; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: 
vl2re32.v v10, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v11, v8, 2 +; VLS-NEXT: vs2r.v v10, (a0) ; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp @@ -274,9 +344,9 @@ define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) { ; VLS: # %bb.0: ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLS-NEXT: vle32.v v8, (a1) -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v10, v8, 6 -; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v9, v8, 2 +; VLS-NEXT: vs2r.v v8, (a0) ; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6) @@ -542,9 +612,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) { ; VLS-LABEL: insert_v2i64_nxv16i64: ; VLS: # %bb.0: ; VLS-NEXT: vl1re64.v v8, (a0) -; VLS-NEXT: vl1re64.v v16, (a1) -; VLS-NEXT: vsetivli zero, 6, e64, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v16, 4 +; VLS-NEXT: vl1re64.v v10, (a1) ; VLS-NEXT: vs8r.v v8, (a2) ; VLS-NEXT: ret %sv0 = load <2 x i64>, ptr %psv0 @@ -586,10 +654,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) { ; ; VLS-LABEL: insert_v2i64_nxv16i64_lo2: ; VLS: # %bb.0: -; VLS-NEXT: vl1re64.v v8, (a0) -; VLS-NEXT: vsetivli zero, 4, e64, m8, ta, ma -; VLS-NEXT: vslideup.vi v16, v8, 2 -; VLS-NEXT: vs8r.v v16, (a1) +; VLS-NEXT: vl1re64.v v9, (a0) +; VLS-NEXT: vs8r.v v8, (a1) ; VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 2) @@ -633,7 +699,6 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: ret -; ; RV64-LABEL: insert_v2i64_nxv16i64_hi: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -80 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index 98e6b8f2dd760..76d6f8931d284 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -6,25 +6,39 @@ ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: concat_2xv4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: ret +; VLA-LABEL: concat_2xv4i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv1r.v v10, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_2xv4i32: +; VLS: # %bb.0: +; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> ret <8 x i32> %ab } define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { -; CHECK-LABEL: concat_4xv2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: ret +; VLA-LABEL: concat_4xv2i32: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v10, v11, 2 +; VLA-NEXT: vslideup.vi v8, v9, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: ret +; 
+; VLS-LABEL: concat_4xv2i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v13, v10 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vmv1r.v v12, v8 +; VLS-NEXT: vslideup.vi v13, v11, 2 +; VLS-NEXT: vslideup.vi v12, v9, 2 +; VLS-NEXT: vmv2r.v v8, v12 +; VLS-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> @@ -32,21 +46,38 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x } define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) { -; CHECK-LABEL: concat_8xv1i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v14, v15, 1 -; CHECK-NEXT: vslideup.vi v12, v13, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v12, v14, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 1 -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 -; CHECK-NEXT: ret +; VLA-LABEL: concat_8xv1i32: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vslideup.vi v14, v15, 1 +; VLA-NEXT: vslideup.vi v12, v13, 1 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 2 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vslideup.vi v10, v11, 1 +; VLA-NEXT: vslideup.vi v8, v9, 1 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 4 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_8xv1i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v17, v12 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vslideup.vi v14, v15, 1 +; VLS-NEXT: vmv1r.v v16, v8 +; VLS-NEXT: vslideup.vi v17, v13, 1 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v17, v14, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vslideup.vi v10, v11, 1 +; VLS-NEXT: vslideup.vi v16, v9, 1 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v16, v10, 2 +; VLS-NEXT: vmv2r.v v8, v16 +; VLS-NEXT: ret %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> @@ -58,28 +89,36 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x } define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: concat_2xv8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: concat_2xv8i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv2r.v v12, v10 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_2xv8i32: +; VLS: # %bb.0: +; VLS-NEXT: ret %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %v } define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { -; CHECK-LABEL: concat_4xv4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v14, v11 -; CHECK-NEXT: vmv1r.v v12, v10 -; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; 
CHECK-NEXT: vslideup.vi v12, v14, 4 -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: concat_4xv4i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv1r.v v14, v11 +; VLA-NEXT: vmv1r.v v12, v10 +; VLA-NEXT: vmv1r.v v10, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 4 +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_4xv4i32: +; VLS: # %bb.0: +; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> @@ -87,21 +126,35 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x } define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) { -; CHECK-LABEL: concat_8xv2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v14, v15, 2 -; CHECK-NEXT: vslideup.vi v12, v13, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v14, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: concat_8xv2i32: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v14, v15, 2 +; VLA-NEXT: vslideup.vi v12, v13, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 4 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v10, v11, 2 +; VLA-NEXT: vslideup.vi v8, v9, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_8xv2i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v19, v14 +; VLS-NEXT: vmv1r.v v18, v12 +; VLS-NEXT: vmv1r.v v17, v10 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vmv1r.v v16, v8 +; VLS-NEXT: vslideup.vi v19, v15, 2 +; VLS-NEXT: vslideup.vi v18, v13, 2 +; VLS-NEXT: vslideup.vi v17, v11, 2 +; VLS-NEXT: vslideup.vi v16, v9, 2 +; VLS-NEXT: vmv4r.v v8, v16 +; VLS-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> @@ -123,9 +176,6 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) { ; ; VLS-LABEL: concat_2xv16i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv4r.v v16, v12 -; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %ab @@ -147,14 +197,6 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x ; ; VLS-LABEL: concat_4xv8i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv2r.v v20, v14 -; VLS-NEXT: vmv2r.v v16, v12 -; VLS-NEXT: vmv2r.v v12, v10 -; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLS-NEXT: vslideup.vi v16, v20, 8 -; VLS-NEXT: vslideup.vi v8, v12, 8 -; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 16 ; 
VLS-NEXT: ret %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> @@ -189,25 +231,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x ; ; VLS-LABEL: concat_8xv4i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv1r.v v18, v15 -; VLS-NEXT: vmv1r.v v20, v14 -; VLS-NEXT: vmv1r.v v22, v13 -; VLS-NEXT: vmv1r.v v16, v12 -; VLS-NEXT: vmv1r.v v14, v11 -; VLS-NEXT: vmv1r.v v12, v10 -; VLS-NEXT: vmv1r.v v10, v9 -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v20, v18, 4 -; VLS-NEXT: vslideup.vi v16, v22, 4 -; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLS-NEXT: vslideup.vi v16, v20, 8 -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v12, v14, 4 -; VLS-NEXT: vslideup.vi v8, v10, 4 -; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLS-NEXT: vslideup.vi v8, v12, 8 -; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> From 1a1a661a3bc6bec69436fe928b4f91589fcdbf8a Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 1 Apr 2024 14:02:36 +0800 Subject: [PATCH 2/5] Address review comments: * Rework the subvec size check by introducing a isKnownMultipleOf helper function on TypeSize/ElementCount. We can then reason that the subvector is exactly vector register sized if it is a multiple of the vector register size. * Rename variables to clarify we are checking the subvec size * Update wording of comment --- llvm/include/llvm/Support/TypeSize.h | 12 +++++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 14 +++--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 +++++++-------------- llvm/lib/Target/RISCV/RISCVSubtarget.h | 11 +++++ 4 files changed, 46 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 68dbe1ea3062a..c6779e258be7c 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -181,6 +181,18 @@ template class FixedOrScalableQuantity { return getKnownMinValue() % RHS == 0; } + /// Returns whether or not the callee is known to be a multiple of RHS. + constexpr bool isKnownMultipleOf(const FixedOrScalableQuantity &RHS) const { + // x % y == 0 => x % y == 0 + // x % y == 0 => (vscale * x) % y == 0 + // x % y == 0 => (vscale * x) % (vscale * y) == 0 + // but + // x % y == 0 !=> x % (vscale * y) == 0 + if (!isScalable() && RHS.isScalable()) + return false; + return getKnownMinValue() % RHS.getKnownMinValue() == 0; + } + // Return the minimum value with the assumption that the count is exact. // Use in places where a scalable count doesn't make sense (e.g. non-vector // types, or vectors in backends which don't support scalable vectors). diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index cc8d4bb0c7dab..278476c533038 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2100,13 +2100,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Establish the correct scalable-vector types for any fixed-length type. 
if (SubVecVT.isFixedLengthVector()) { SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); - bool AlignedToVecReg = false; - if (auto VLen = Subtarget->getRealVLen(); - VLen && SubVecVT.getSizeInBits() == - SubVecContainerVT.getSizeInBits().getKnownMinValue() * - (*VLen / RISCV::RVVBitsPerBlock)) - AlignedToVecReg = true; - assert(Idx == 0 && (AlignedToVecReg || V.isUndef())); +#ifndef NDEBUG + TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock); + bool ExactlyVecRegSized = + Subtarget->expandVScale(SubVecVT.getSizeInBits()) + .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize)); + assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef())); +#endif } MVT ContainerVT = VT; if (VT.isFixedLengthVector()) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9bb984632124e..122c475267048 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9723,21 +9723,6 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, Vec, Mask, VL, DL, DAG, Subtarget); } -/// Returns true if \p LHS is known to be equal to \p RHS, taking into account -/// if VLEN is exactly known by \p Subtarget and thus vscale when handling -/// scalable quantities. -static bool isKnownEQ(ElementCount LHS, ElementCount RHS, - const RISCVSubtarget &Subtarget) { - if (auto VLen = Subtarget.getRealVLen()) { - const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; - if (LHS.isScalable()) - LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale); - if (RHS.isScalable()) - RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale); - } - return LHS == RHS; -} - SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); @@ -9875,29 +9860,25 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, RemIdx = ElementCount::getScalable(Decompose.second); } - RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT); - bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 || - SubVecLMUL == RISCVII::VLMUL::LMUL_F4 || - SubVecLMUL == RISCVII::VLMUL::LMUL_F8; - bool AlignedToVecReg = !IsSubVecPartReg; - if (SubVecVT.isFixedLengthVector()) - AlignedToVecReg &= SubVecVT.getSizeInBits() == - ContainerSubVecVT.getSizeInBits().getKnownMinValue() * - (*VLen / RISCV::RVVBitsPerBlock); + TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock); + bool ExactlyVecRegSized = + Subtarget.expandVScale(SubVecVT.getSizeInBits()) + .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize)); // 1. If the Idx has been completely eliminated and this subvector's size is // a vector register or a multiple thereof, or the surrounding elements are // undef, then this is a subvector insert which naturally aligns to a vector // register. These can easily be handled using subregister manipulation. - // 2. If the subvector isn't exactly aligned to a vector register group, then - // the insertion must preserve the undisturbed elements of the register. We do - // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector - // type (which resolves to a subregister copy), performing a VSLIDEUP to place - // the subvector within the vector register, and an INSERT_SUBVECTOR of that - // LMUL=1 type back into the larger vector (resolving to another subregister - // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type - // to avoid allocating a large register group to hold our subvector. 
- if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) { + // 2. If the subvector isn't an exact multiple of a valid register group size, + // then the insertion must preserve the undisturbed elements of the register. + // We do this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 + // vector type (which resolves to a subregister copy), performing a VSLIDEUP + // to place the subvector within the vector register, and an INSERT_SUBVECTOR + // of that LMUL=1 type back into the larger vector (resolving to another + // subregister operation). See below for how our VSLIDEUP works. We go via a + // LMUL=1 type to avoid allocating a large register group to hold our + // subvector. + if (RemIdx.isZero() && (ExactlyVecRegSized || Vec.isUndef())) { if (SubVecVT.isFixedLengthVector()) { // We may get NoSubRegister if inserting at index 0 and the subvec // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0 @@ -9944,7 +9925,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, // Use tail agnostic policy if we're inserting over InterSubVT's tail. unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; - if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget)) + if (Subtarget.expandVScale(EndIndex) == + Subtarget.expandVScale(InterSubVT.getVectorElementCount())) Policy = RISCVII::TAIL_AGNOSTIC; // If we're inserting into the lowest elements, use a tail undisturbed diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 85f8f5f654fe7..c880c9e921e0e 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -200,6 +200,17 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { return Min; } + /// If the ElementCount or TypeSize \p X is scalable and VScale (VLEN) is + /// exactly known, returns \p X converted to a fixed quantity. Otherwise + /// returns \p X unmodified. + template Quantity expandVScale(Quantity X) const { + if (auto VLen = getRealVLen(); VLen && X.isScalable()) { + const unsigned VScale = *VLen / RISCV::RVVBitsPerBlock; + X = Quantity::getFixed(X.getKnownMinValue() * VScale); + } + return X; + } + RISCVABI::ABI getTargetABI() const { return TargetABI; } bool isSoftFPABI() const { return TargetABI == RISCVABI::ABI_LP64 || From 3cee43edb7d311c678becd37f5568f5c295dc012 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 9 Apr 2024 07:13:57 +0800 Subject: [PATCH 3/5] Add asserts that the subvector size is a power of 2 This is an invariant needed if we want to check that the subvector exactly fills a vector register by checking if it's a multiple of vlen. 
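
For illustration only (not part of the patch): a minimal standalone C++
sketch of the invariant this assert protects, assuming VLEN is known
exactly and using plain integers in place of LLVM's TypeSize and
ElementCount helpers. The name exactlyVecRegSized is hypothetical.

  // Sketch: with a known VLEN, a fixed-length subvector fills whole
  // vector registers iff its size in bits is a multiple of VLEN. The
  // power-of-2 assert mirrors the one this patch adds to the lowering.
  #include <cassert>
  #include <cstdint>

  bool exactlyVecRegSized(uint64_t SubVecBits, uint64_t VLenBits) {
    assert(SubVecBits != 0 && (SubVecBits & (SubVecBits - 1)) == 0 &&
           "subvector size must be a power of 2");
    return SubVecBits % VLenBits == 0;
  }

  int main() {
    // With VLEN=128: a 128-bit subvector (e.g. v4i32) exactly fills a
    // vector register, while a 64-bit subvector (e.g. v2i32) does not.
    assert(exactlyVecRegSized(128, 128));
    assert(!exactlyVecRegSized(64, 128));
    return 0;
  }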
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 278476c533038..42d523ae4788c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2105,6 +2105,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { bool ExactlyVecRegSized = Subtarget->expandVScale(SubVecVT.getSizeInBits()) .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize)); + assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits()) + .getKnownMinValue())); assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef())); #endif } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 122c475267048..bcda9ec781fc0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9861,6 +9861,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, } TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock); + assert(isPowerOf2_64( + Subtarget.expandVScale(SubVecVT.getSizeInBits()).getKnownMinValue())); bool ExactlyVecRegSized = Subtarget.expandVScale(SubVecVT.getSizeInBits()) .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize)); From 8d1f95f69213884471429ad912d28eea6ccc2cc5 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 25 Apr 2024 15:56:31 +0800 Subject: [PATCH 4/5] Add [[maybe_unused]], remove unused check prefix An unused check prefix apparently causes an error? Not sure if it was always like that. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 +- .../CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 42d523ae4788c..f41eb4d4ffb27 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2102,7 +2102,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); #ifndef NDEBUG TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock); - bool ExactlyVecRegSized = + [[maybe_unused]] bool ExactlyVecRegSized = Subtarget->expandVScale(SubVecVT.getSizeInBits()) .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize)); assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits()) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index 76d6f8931d284..609b4e9824892 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s -; RUN: llc < %s 
-mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { ; VLA-LABEL: concat_2xv4i32: From 29ed9cd819ee3fdc88edae07fdb46fa8c70e1a7b Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 30 Apr 2024 20:30:03 +0800 Subject: [PATCH 5/5] Remove NDEBUG gating around assertions --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index f41eb4d4ffb27..dc3ad5ac5908c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2100,7 +2100,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Establish the correct scalable-vector types for any fixed-length type. if (SubVecVT.isFixedLengthVector()) { SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); -#ifndef NDEBUG TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock); [[maybe_unused]] bool ExactlyVecRegSized = Subtarget->expandVScale(SubVecVT.getSizeInBits()) @@ -2108,7 +2107,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits()) .getKnownMinValue())); assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef())); -#endif } MVT ContainerVT = VT; if (VT.isFixedLengthVector())