diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 68dbe1ea3062a..c6779e258be7c 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -181,6 +181,18 @@ template <typename LeafTy, typename ValueTy> class FixedOrScalableQuantity {
     return getKnownMinValue() % RHS == 0;
   }
 
+  /// Returns whether or not the callee is known to be a multiple of RHS.
+  constexpr bool isKnownMultipleOf(const FixedOrScalableQuantity &RHS) const {
+    // x % y == 0 => x % y == 0
+    // x % y == 0 => (vscale * x) % y == 0
+    // x % y == 0 => (vscale * x) % (vscale * y) == 0
+    // but
+    // x % y == 0 !=> x % (vscale * y) == 0
+    if (!isScalable() && RHS.isScalable())
+      return false;
+    return getKnownMinValue() % RHS.getKnownMinValue() == 0;
+  }
+
   // Return the minimum value with the assumption that the count is exact.
   // Use in places where a scalable count doesn't make sense (e.g. non-vector
   // types, or vectors in backends which don't support scalable vectors).
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index da1543bd7112a..dc3ad5ac5908c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2099,8 +2099,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
-      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+      TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+      [[maybe_unused]] bool ExactlyVecRegSized =
+          Subtarget->expandVScale(SubVecVT.getSizeInBits())
+              .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
+                               .getKnownMinValue()));
+      assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index fe8edcf39681d..bcda9ec781fc0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9772,12 +9772,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     }
   }
 
-  // If the subvector vector is a fixed-length type, we cannot use subregister
-  // manipulation to simplify the codegen; we don't know which register of a
-  // LMUL group contains the specific subvector as we only know the minimum
-  // register size. Therefore we must slide the vector group up the full
-  // amount.
-  if (SubVecVT.isFixedLengthVector()) {
+  // If the subvector is a fixed-length type and we don't know VLEN exactly,
+  // we cannot use subregister manipulation to simplify the codegen; we don't
+  // know which register of a LMUL group contains the specific subvector as we
+  // only know the minimum register size. Therefore we must slide the vector
+  // group up the full amount.
+  const auto VLen = Subtarget.getRealVLen();
+  if (SubVecVT.isFixedLengthVector() && !VLen) {
     if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector())
       return Op;
     MVT ContainerVT = VecVT;
@@ -9825,41 +9826,90 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     return DAG.getBitcast(Op.getValueType(), SubVec);
   }
 
-  unsigned SubRegIdx, RemIdx;
-  std::tie(SubRegIdx, RemIdx) =
-      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
-          VecVT, SubVecVT, OrigIdx, TRI);
+  MVT ContainerVecVT = VecVT;
+  if (VecVT.isFixedLengthVector()) {
+    ContainerVecVT = getContainerForFixedLengthVector(VecVT);
+    Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+  }
+
+  MVT ContainerSubVecVT = SubVecVT;
+  if (SubVecVT.isFixedLengthVector()) {
+    ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT);
+    SubVec = convertToScalableVector(ContainerSubVecVT, SubVec, DAG, Subtarget);
+  }
+
+  unsigned SubRegIdx;
+  ElementCount RemIdx;
+  // insert_subvector scales the index by vscale if the subvector is scalable,
+  // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
+  // we have a fixed length subvector, we need to adjust the index by 1/vscale.
+  if (SubVecVT.isFixedLengthVector()) {
+    assert(VLen);
+    unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getFixed((Decompose.second * Vscale) +
+                                    (OrigIdx % Vscale));
+  } else {
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getScalable(Decompose.second);
+  }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
-  bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+  TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  assert(isPowerOf2_64(
+      Subtarget.expandVScale(SubVecVT.getSizeInBits()).getKnownMinValue()));
+  bool ExactlyVecRegSized =
+      Subtarget.expandVScale(SubVecVT.getSizeInBits())
+          .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector is smaller than a vector register, then the insertion
-  // must preserve the undisturbed elements of the register. We do this by
-  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
-  // (which resolves to a subregister copy), performing a VSLIDEUP to place the
-  // subvector within the vector register, and an INSERT_SUBVECTOR of that
-  // LMUL=1 type back into the larger vector (resolving to another subregister
-  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
-  // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+  // 2. If the subvector isn't an exact multiple of a valid register group size,
+  // then the insertion must preserve the undisturbed elements of the register.
+ // We do this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 + // vector type (which resolves to a subregister copy), performing a VSLIDEUP + // to place the subvector within the vector register, and an INSERT_SUBVECTOR + // of that LMUL=1 type back into the larger vector (resolving to another + // subregister operation). See below for how our VSLIDEUP works. We go via a + // LMUL=1 type to avoid allocating a large register group to hold our + // subvector. + if (RemIdx.isZero() && (ExactlyVecRegSized || Vec.isUndef())) { + if (SubVecVT.isFixedLengthVector()) { + // We may get NoSubRegister if inserting at index 0 and the subvec + // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0 + if (SubRegIdx == RISCV::NoSubRegister) { + assert(OrigIdx == 0); + return Op; + } + + SDValue Insert = + DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec); + if (VecVT.isFixedLengthVector()) + Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget); + return Insert; + } return Op; + } // VSLIDEUP works by leaving elements 0 Quantity expandVScale(Quantity X) const { + if (auto VLen = getRealVLen(); VLen && X.isScalable()) { + const unsigned VScale = *VLen / RISCV::RVVBitsPerBlock; + X = Quantity::getFixed(X.getKnownMinValue() * VScale); + } + return X; + } + RISCVABI::ABI getTargetABI() const { return TargetABI; } bool isSoftFPABI() const { return TargetABI == RISCVABI::ABI_LP64 || diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index ab6df1d3e883f..53de1a8755355 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -9,39 +9,63 @@ ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s define @insert_nxv8i32_v2i32_0( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, ma -; CHECK-NEXT: vmv.v.v v8, v12 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; VLA-NEXT: vmv.v.v v8, v12 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; VLS-NEXT: vmv.v.v v8, v12 +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 0) ret %v } define @insert_nxv8i32_v2i32_2( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v2i32_2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 2 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v2i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; VLA-NEXT: vslideup.vi v8, v12, 2 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v2i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v8, v12, 2 +; 
VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 2) ret %v } define @insert_nxv8i32_v2i32_6( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v2i32_6: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 6 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v2i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; VLA-NEXT: vslideup.vi v8, v12, 6 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v2i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v9, v12, 2 +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 6) ret %v @@ -58,9 +82,7 @@ define @insert_nxv8i32_v8i32_0( %vec, ptr % ; ; VLS-LABEL: insert_nxv8i32_v8i32_0: ; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v12, (a0) -; VLS-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; VLS-NEXT: vmv.v.v v8, v12 +; VLS-NEXT: vl2re32.v v8, (a0) ; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 0) @@ -78,9 +100,7 @@ define @insert_nxv8i32_v8i32_8( %vec, ptr % ; ; VLS-LABEL: insert_nxv8i32_v8i32_8: ; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v12, (a0) -; VLS-NEXT: vsetivli zero, 16, e32, m4, tu, ma -; VLS-NEXT: vslideup.vi v8, v12, 8 +; VLS-NEXT: vl2re32.v v10, (a0) ; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 8) @@ -98,6 +118,31 @@ define @insert_nxv8i32_undef_v2i32_0(ptr %svp) { ret %v } +define @insert_nxv8i32_v4i32_0( %vec, <4 x i32> %subvec) { +; VLA-LABEL: insert_nxv8i32_v4i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; VLA-NEXT: vmv.v.v v8, v9 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v4i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v8, v9 +; VLS-NEXT: ret + %v = call @llvm.vector.insert.nxv2i32.v4i32( %vec, <4 x i32> %subvec, i64 0) + ret %v +} + + +define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: insert_v4i32_v4i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0) + ret <4 x i32> %v +} + define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v4i32_v2i32_0: ; VLA: # %bb.0: @@ -175,6 +220,31 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) { ret void } +; This tests the code path in RISCVISelDAGToDAG::Select where we select an +; insert_subvector with a fixed vector and fixed subvector type. The phi here is +; used to prevent the fixed insert_subvector from being combined away into a +; scalable insert_subvector. 
+define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) { +; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: bnez a0, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: .LBB11_2: # %bar +; CHECK-NEXT: ret +entry: + br i1 %cond, label %foo, label %bar +foo: + %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0) + br label %bar +bar: + %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry] + ret <4 x i32> %w +} + + define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v8i32_v2i32_0: ; VLA: # %bb.0: @@ -193,7 +263,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLS-NEXT: vle32.v v8, (a1) ; VLS-NEXT: vl2re32.v v10, (a0) -; VLS-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; VLS-NEXT: vmv.v.v v10, v8 ; VLS-NEXT: vs2r.v v10, (a0) ; VLS-NEXT: ret @@ -220,11 +290,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { ; VLS-LABEL: insert_v8i32_v2i32_2: ; VLS: # %bb.0: ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vle32.v v10, (a1) -; VLS-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; VLS-NEXT: vslideup.vi v8, v10, 2 -; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl2re32.v v10, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v10, v8, 2 +; VLS-NEXT: vs2r.v v10, (a0) ; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp @@ -247,11 +317,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { ; VLS-LABEL: insert_v8i32_v2i32_6: ; VLS: # %bb.0: ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vle32.v v10, (a1) -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v8, v10, 6 -; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl2re32.v v10, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v11, v8, 2 +; VLS-NEXT: vs2r.v v10, (a0) ; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp @@ -274,9 +344,9 @@ define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) { ; VLS: # %bb.0: ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLS-NEXT: vle32.v v8, (a1) -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v10, v8, 6 -; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v9, v8, 2 +; VLS-NEXT: vs2r.v v8, (a0) ; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6) @@ -542,9 +612,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) { ; VLS-LABEL: insert_v2i64_nxv16i64: ; VLS: # %bb.0: ; VLS-NEXT: vl1re64.v v8, (a0) -; VLS-NEXT: vl1re64.v v16, (a1) -; VLS-NEXT: vsetivli zero, 6, e64, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v16, 4 +; VLS-NEXT: vl1re64.v v10, (a1) ; VLS-NEXT: vs8r.v v8, (a2) ; VLS-NEXT: ret %sv0 = load <2 x i64>, ptr %psv0 @@ -586,10 +654,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) { ; ; VLS-LABEL: insert_v2i64_nxv16i64_lo2: ; VLS: # %bb.0: -; VLS-NEXT: vl1re64.v v8, (a0) -; VLS-NEXT: vsetivli zero, 4, e64, m8, ta, ma -; VLS-NEXT: vslideup.vi v16, v8, 2 -; VLS-NEXT: vs8r.v v16, (a1) +; VLS-NEXT: vl1re64.v v9, (a0) +; 
VLS-NEXT: vs8r.v v8, (a1) ; VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 2) @@ -633,7 +699,6 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: ret -; ; RV64-LABEL: insert_v2i64_nxv16i64_hi: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -80 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index 98e6b8f2dd760..609b4e9824892 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -1,30 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: concat_2xv4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: ret +; VLA-LABEL: concat_2xv4i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv1r.v v10, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_2xv4i32: +; VLS: # %bb.0: +; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> ret <8 x i32> %ab } define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { -; CHECK-LABEL: concat_4xv2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: ret +; VLA-LABEL: concat_4xv2i32: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v10, v11, 2 +; VLA-NEXT: vslideup.vi v8, v9, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_4xv2i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v13, v10 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vmv1r.v v12, v8 +; VLS-NEXT: vslideup.vi v13, v11, 2 +; VLS-NEXT: vslideup.vi v12, v9, 2 +; VLS-NEXT: vmv2r.v v8, v12 +; VLS-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> @@ -32,21 +46,38 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x } define 
<8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) { -; CHECK-LABEL: concat_8xv1i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v14, v15, 1 -; CHECK-NEXT: vslideup.vi v12, v13, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v12, v14, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 1 -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 -; CHECK-NEXT: ret +; VLA-LABEL: concat_8xv1i32: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vslideup.vi v14, v15, 1 +; VLA-NEXT: vslideup.vi v12, v13, 1 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 2 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vslideup.vi v10, v11, 1 +; VLA-NEXT: vslideup.vi v8, v9, 1 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 4 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_8xv1i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v17, v12 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vslideup.vi v14, v15, 1 +; VLS-NEXT: vmv1r.v v16, v8 +; VLS-NEXT: vslideup.vi v17, v13, 1 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v17, v14, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vslideup.vi v10, v11, 1 +; VLS-NEXT: vslideup.vi v16, v9, 1 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v16, v10, 2 +; VLS-NEXT: vmv2r.v v8, v16 +; VLS-NEXT: ret %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> @@ -58,28 +89,36 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x } define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: concat_2xv8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: concat_2xv8i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv2r.v v12, v10 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_2xv8i32: +; VLS: # %bb.0: +; VLS-NEXT: ret %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %v } define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { -; CHECK-LABEL: concat_4xv4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v14, v11 -; CHECK-NEXT: vmv1r.v v12, v10 -; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v14, 4 -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: concat_4xv4i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv1r.v v14, v11 +; VLA-NEXT: vmv1r.v v12, v10 +; VLA-NEXT: vmv1r.v v10, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 4 +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: 
concat_4xv4i32: +; VLS: # %bb.0: +; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> @@ -87,21 +126,35 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x } define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) { -; CHECK-LABEL: concat_8xv2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v14, v15, 2 -; CHECK-NEXT: vslideup.vi v12, v13, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v14, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: concat_8xv2i32: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v14, v15, 2 +; VLA-NEXT: vslideup.vi v12, v13, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 4 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v10, v11, 2 +; VLA-NEXT: vslideup.vi v8, v9, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_8xv2i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv1r.v v19, v14 +; VLS-NEXT: vmv1r.v v18, v12 +; VLS-NEXT: vmv1r.v v17, v10 +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vmv1r.v v16, v8 +; VLS-NEXT: vslideup.vi v19, v15, 2 +; VLS-NEXT: vslideup.vi v18, v13, 2 +; VLS-NEXT: vslideup.vi v17, v11, 2 +; VLS-NEXT: vslideup.vi v16, v9, 2 +; VLS-NEXT: vmv4r.v v8, v16 +; VLS-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> @@ -123,9 +176,6 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) { ; ; VLS-LABEL: concat_2xv16i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv4r.v v16, v12 -; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %ab @@ -147,14 +197,6 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x ; ; VLS-LABEL: concat_4xv8i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv2r.v v20, v14 -; VLS-NEXT: vmv2r.v v16, v12 -; VLS-NEXT: vmv2r.v v12, v10 -; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLS-NEXT: vslideup.vi v16, v20, 8 -; VLS-NEXT: vslideup.vi v8, v12, 8 -; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> @@ -189,25 +231,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x ; ; VLS-LABEL: concat_8xv4i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv1r.v v18, v15 -; VLS-NEXT: vmv1r.v v20, v14 -; VLS-NEXT: vmv1r.v v22, v13 -; VLS-NEXT: vmv1r.v v16, v12 -; VLS-NEXT: vmv1r.v v14, v11 -; VLS-NEXT: vmv1r.v v12, v10 -; VLS-NEXT: vmv1r.v v10, v9 -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: 
vslideup.vi v20, v18, 4 -; VLS-NEXT: vslideup.vi v16, v22, 4 -; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLS-NEXT: vslideup.vi v16, v20, 8 -; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLS-NEXT: vslideup.vi v12, v14, 4 -; VLS-NEXT: vslideup.vi v8, v10, 4 -; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLS-NEXT: vslideup.vi v8, v12, 8 -; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32>
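
As a quick illustration of how the two new helpers compose, the sketch below exercises the isKnownMultipleOf overload added to FixedOrScalableQuantity together with a free-function restatement of the new RISCVSubtarget::expandVScale. This is a minimal sketch, not code from the patch: expandVScaleAssumingVLen is a hypothetical stand-in (so no RISCVSubtarget instance is needed), and the RVVBitsPerBlock default of 64 mirrors RISCV::RVVBitsPerBlock; only the TypeSize API shown in this diff is assumed.

// Sketch only: illustrates the semantics relied on by ExactlyVecRegSized above.
#include "llvm/Support/TypeSize.h"
#include <cassert>
#include <optional>

using namespace llvm;

// Hypothetical free-function version of RISCVSubtarget::expandVScale: when
// VLEN is known exactly, a scalable size can be folded to a fixed size.
static TypeSize expandVScaleAssumingVLen(TypeSize X,
                                         std::optional<unsigned> VLen,
                                         unsigned RVVBitsPerBlock = 64) {
  if (VLen && X.isScalable()) {
    const unsigned VScale = *VLen / RVVBitsPerBlock;
    X = TypeSize::getFixed(X.getKnownMinValue() * VScale);
  }
  return X;
}

int main() {
  // One RVV register is vscale x 64 bits.
  TypeSize VecRegSize = TypeSize::getScalable(64);

  // A fixed 128-bit subvector is *not* known to be a multiple of a scalable
  // quantity: vscale is unknown, so 128 % (vscale * 64) == 0 cannot be proven.
  assert(!TypeSize::getFixed(128).isKnownMultipleOf(VecRegSize));

  // A scalable size is known to be a multiple of another scalable size when
  // the minimum values divide evenly: (vscale * 128) % (vscale * 64) == 0.
  assert(TypeSize::getScalable(128).isKnownMultipleOf(VecRegSize));

  // With an exact VLEN (e.g. -riscv-v-vector-bits-max=128, so vscale == 2),
  // both sides collapse to fixed sizes and the check succeeds: 128 % 128 == 0.
  std::optional<unsigned> VLen = 128;
  assert(expandVScaleAssumingVLen(TypeSize::getFixed(128), VLen)
             .isKnownMultipleOf(expandVScaleAssumingVLen(VecRegSize, VLen)));
  return 0;
}

Collapsing both operands to fixed sizes under a known VLEN is exactly what lets the ExactlyVecRegSized checks in RISCVISelDAGToDAG.cpp and lowerINSERT_SUBVECTOR treat register-sized fixed-length subvectors as plain subregister inserts, which is where the vslideup instructions disappear in the VLS test diffs above.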