diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a417b6fe05e59..62b6ac365e338 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3105,6 +3105,14 @@ getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
   return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops);
 }
 
+static MVT getLMUL1VT(MVT VT) {
+  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+         "Unexpected vector MVT");
+  return MVT::getScalableVectorVT(
+      VT.getVectorElementType(),
+      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
+}
+
 struct VIDSequence {
   int64_t StepNumerator;
   unsigned StepDenominator;
@@ -3750,6 +3758,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
     return Res;
 
+  // If we're compiling for an exact VLEN value, we can split our work per
+  // register in the register group.
+  const unsigned MinVLen = Subtarget.getRealMinVLen();
+  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+  if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) {
+    MVT ElemVT = VT.getVectorElementType();
+    unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+    MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
+    MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
+    assert(M1VT == getLMUL1VT(M1VT));
+
+    // The following semantically builds up a fixed length concat_vector
+    // of the component build_vectors. We eagerly lower to scalable and
+    // insert_subvector here to avoid DAG combining it back to a large
+    // build_vector.
+    SmallVector<SDValue> BuildVectorOps(Op->op_begin(), Op->op_end());
+    unsigned NumOpElts = M1VT.getVectorMinNumElements();
+    SDValue Vec = DAG.getUNDEF(ContainerVT);
+    for (unsigned i = 0; i < VT.getVectorNumElements(); i += ElemsPerVReg) {
+      auto OneVRegOfOps = ArrayRef(BuildVectorOps).slice(i, ElemsPerVReg);
+      SDValue SubBV =
+          DAG.getNode(ISD::BUILD_VECTOR, DL, OneRegVT, OneVRegOfOps);
+      SubBV = convertToScalableVector(M1VT, SubBV, DAG, Subtarget);
+      unsigned InsertIdx = (i / ElemsPerVReg) * NumOpElts;
+      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubBV,
+                        DAG.getVectorIdxConstant(InsertIdx, DL));
+    }
+    return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+  }
+
   // Cap the cost at a value linear to the number of elements in the vector.
   // The default lowering is to use the stack. The vector store + scalar loads
   // is linear in VL. However, at high lmuls vslide1down and vslidedown end up
@@ -3944,14 +3983,6 @@ static SDValue lowerScalarSplat(SDValue Passthru, SDValue Scalar, SDValue VL,
   return splatSplitI64WithVL(DL, VT, Passthru, Scalar, VL, DAG);
 }
 
-static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
-         "Unexpected vector MVT");
-  return MVT::getScalableVectorVT(
-      VT.getVectorElementType(),
-      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
-}
-
 // This function lowers an insert of a scalar operand Scalar into lane
 // 0 of the vector regardless of the value of VL. The contents of the
 // remaining lanes of the result vector are unspecified. VL is assumed
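For context: with an exact VLEN (for example vscale_range(2,2), i.e. VLEN=128), a fixed-length vector wider than one register is now built one LMUL=1 register at a time and stitched together with insert_subvector, instead of going through a stack store and reload. A minimal IR sketch in the style of the tests below (the function name @split_example is illustrative, not part of this patch):

define <4 x i64> @split_example(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) {
  ; <4 x i64> is 256 bits, i.e. two 128-bit registers, so the build_vector is
  ; split into two <2 x i64> halves, each lowered with vmv.v.x + vslide1down.vx.
  %v1 = insertelement <4 x i64> poison, i64 %a, i32 0
  %v2 = insertelement <4 x i64> %v1, i64 %b, i32 1
  %v3 = insertelement <4 x i64> %v2, i64 %c, i32 2
  %v4 = insertelement <4 x i64> %v3, i64 %d, i32 3
  ret <4 x i64> %v4
}

This is the same pattern exercised by the updated v4xi64_exact checks in fixed-vectors-int-buildvec.ll below.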
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 31ed3083e05a1..b8b41b9e4c916 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1080,206 +1080,156 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7, double %e8, double %e9, double %e10, double %e11, double %e12, double %e13, double %e14, double %e15, double %e16, double %e17, double %e18, double %e19, double %e20, double %e21, double %e22, double %e23, double %e24, double %e25, double %e26, double %e27, double %e28, double %e29, double %e30, double %e31) vscale_range(2,2) {
 ; RV32-LABEL: buildvec_v32f64_exact_vlen:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -512
-; RV32-NEXT:    .cfi_def_cfa_offset 512
-; RV32-NEXT:    sw ra, 508(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 504(sp) # 4-byte Folded Spill
-; RV32-NEXT:    fsd fs0, 496(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs1, 488(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs2, 480(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs3, 472(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs4, 464(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs5, 456(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs6, 448(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs7, 440(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs8, 432(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs9, 424(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs10, 416(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs11, 408(sp) # 8-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset fs0, -16
-; RV32-NEXT:    .cfi_offset fs1, -24
-; RV32-NEXT:    .cfi_offset fs2, -32
-; RV32-NEXT:    .cfi_offset fs3, -40
-; RV32-NEXT:    .cfi_offset fs4, -48
-; RV32-NEXT:    .cfi_offset fs5, -56
-; RV32-NEXT:    .cfi_offset fs6, -64
-; RV32-NEXT:    .cfi_offset fs7, -72
-; RV32-NEXT:    .cfi_offset fs8, -80
-; RV32-NEXT:    .cfi_offset fs9, -88
-; RV32-NEXT:    .cfi_offset fs10, -96
-; RV32-NEXT:    .cfi_offset fs11, -104
-; RV32-NEXT:    addi s0, sp, 512
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -128
-; RV32-NEXT:    sw a0, 120(sp)
-; RV32-NEXT:    sw a1, 124(sp)
-; RV32-NEXT:    fld ft0, 120(sp)
-; RV32-NEXT:    sw a2, 120(sp)
-; RV32-NEXT:    sw a3, 124(sp)
-; RV32-NEXT:    fld ft1, 120(sp)
-; RV32-NEXT:    sw a4, 120(sp)
-; RV32-NEXT:    sw a5, 124(sp)
-; RV32-NEXT:    fld ft2, 120(sp)
-; RV32-NEXT:    sw a6, 120(sp)
-; RV32-NEXT:    sw a7, 124(sp)
-; RV32-NEXT:    fld ft3, 120(sp)
-; RV32-NEXT:    fld ft4, 0(s0)
-; RV32-NEXT:    fld ft5, 8(s0)
-; RV32-NEXT:    fld ft6, 16(s0)
-; RV32-NEXT:    fld ft7, 24(s0)
-; RV32-NEXT:    fld ft8, 32(s0)
-; RV32-NEXT:    fld ft9, 40(s0)
-; RV32-NEXT:    fld ft10, 48(s0)
-; RV32-NEXT:    fld ft11, 56(s0)
-; RV32-NEXT:    fld fs0, 64(s0)
-; RV32-NEXT:    fld fs1, 72(s0)
-; RV32-NEXT:    fld fs2, 80(s0)
-; RV32-NEXT:    fld fs3, 88(s0)
-; RV32-NEXT:    fld fs4, 96(s0)
-; RV32-NEXT:    fld fs5, 104(s0)
-; RV32-NEXT:    fld fs6, 112(s0)
-; RV32-NEXT:    fld fs7, 120(s0)
-; RV32-NEXT:    fld fs8, 152(s0)
-; RV32-NEXT:    fld fs9, 144(s0)
-; RV32-NEXT:    fld fs10, 136(s0)
-; RV32-NEXT:    fld fs11, 128(s0)
-; RV32-NEXT:    fsd fs8, 248(sp)
-; RV32-NEXT:    fsd fs9, 240(sp)
-; RV32-NEXT:    fsd fs10, 232(sp)
-; RV32-NEXT:    fsd fs11, 224(sp)
-; RV32-NEXT:    fsd fs7, 216(sp)
-; RV32-NEXT:    fsd fs6, 208(sp)
-; RV32-NEXT:    fsd fs5, 200(sp)
-; RV32-NEXT:    fsd fs4, 192(sp)
-; RV32-NEXT:    fsd fs3, 184(sp)
-; RV32-NEXT:    fsd fs2, 176(sp)
-; RV32-NEXT:    fsd fs1, 168(sp)
-; RV32-NEXT:    fsd fs0, 160(sp)
-; RV32-NEXT:    fsd ft11, 152(sp)
-; RV32-NEXT:    fsd ft10, 144(sp)
-; RV32-NEXT:    fsd ft9, 136(sp)
-; RV32-NEXT:    fsd ft8, 128(sp)
-; RV32-NEXT:    fsd ft7, 376(sp)
-; RV32-NEXT:    fsd ft6, 368(sp)
-; RV32-NEXT:    fsd ft5, 360(sp)
-; RV32-NEXT:    fsd ft4, 352(sp)
-; RV32-NEXT:    fsd fa7, 312(sp)
-; RV32-NEXT:    fsd fa6, 304(sp)
-; RV32-NEXT:    fsd fa5, 296(sp)
-; RV32-NEXT:    fsd fa4, 288(sp)
-; RV32-NEXT:    fsd fa3, 280(sp)
-; RV32-NEXT:    fsd fa2, 272(sp)
-; RV32-NEXT:    fsd fa1, 264(sp)
-; RV32-NEXT:    fsd fa0, 256(sp)
-; RV32-NEXT:    fsd ft3, 344(sp)
-; RV32-NEXT:    fsd ft2, 336(sp)
-; RV32-NEXT:    fsd ft1, 328(sp)
-; RV32-NEXT:    fsd ft0, 320(sp)
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    fsd fs0, 24(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs1, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT:    .cfi_offset fs0, -8
+; RV32-NEXT:    .cfi_offset fs1, -16
+; RV32-NEXT:    sw a6, 8(sp)
+; RV32-NEXT:    sw a7, 12(sp)
+; RV32-NEXT:    fld ft4, 8(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a5, 12(sp)
+; RV32-NEXT:    fld ft5, 8(sp)
+; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    sw a3, 12(sp)
+; RV32-NEXT:    fld ft6, 8(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    fld ft7, 8(sp)
+; RV32-NEXT:    fld ft0, 184(sp)
+; RV32-NEXT:    fld ft1, 168(sp)
+; RV32-NEXT:    fld ft2, 152(sp)
+; RV32-NEXT:    fld ft3, 136(sp)
+; RV32-NEXT:    fld ft8, 120(sp)
+; RV32-NEXT:    fld ft9, 104(sp)
+; RV32-NEXT:    fld ft10, 72(sp)
+; RV32-NEXT:    fld ft11, 88(sp)
+; RV32-NEXT:    fld fs0, 56(sp)
+; RV32-NEXT:    fld fs1, 40(sp)
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vfmv.v.f v8, ft7
+; RV32-NEXT:    vfslide1down.vf v12, v8, ft6
+; RV32-NEXT:    vfmv.v.f v8, fa2
+; RV32-NEXT:    vfslide1down.vf v9, v8, fa3
+; RV32-NEXT:    vfmv.v.f v8, fa0
+; RV32-NEXT:    vfslide1down.vf v8, v8, fa1
+; RV32-NEXT:    vfmv.v.f v10, fa4
+; RV32-NEXT:    vfslide1down.vf v10, v10, fa5
+; RV32-NEXT:    vfmv.v.f v11, fa6
+; RV32-NEXT:    vfslide1down.vf v11, v11, fa7
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vlse64.v v14, (a0), zero
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vlse64.v v15, (a0), zero
+; RV32-NEXT:    vfmv.v.f v13, ft5
+; RV32-NEXT:    vfslide1down.vf v13, v13, ft4
+; RV32-NEXT:    vfslide1down.vf v14, v14, fs1
+; RV32-NEXT:    vfslide1down.vf v15, v15, fs0
+; RV32-NEXT:    addi a0, sp, 80
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vlse64.v v18, (a0), zero
+; RV32-NEXT:    addi a0, sp, 96
+; RV32-NEXT:    vlse64.v v19, (a0), zero
+; RV32-NEXT:    addi a0, sp, 112
+; RV32-NEXT:    vlse64.v v20, (a0), zero
+; RV32-NEXT:    vfslide1down.vf v17, v16, ft11
+; RV32-NEXT:    vfslide1down.vf v16, v18, ft10
+; RV32-NEXT:    vfslide1down.vf v18, v19, ft9
+; RV32-NEXT:    vfslide1down.vf v19, v20, ft8
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v16, (a0)
-; RV32-NEXT:    addi a0, sp, 256
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -512
-; RV32-NEXT:    lw ra, 508(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 504(sp) # 4-byte Folded Reload
-; RV32-NEXT:    fld fs0, 496(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs1, 488(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs2, 480(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs3, 472(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs4, 464(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs5, 456(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs6, 448(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs7, 440(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs8, 432(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs9, 424(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs10, 416(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs11, 408(sp) # 8-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 512
+; RV32-NEXT:    vlse64.v v20, (a0), zero
+; RV32-NEXT:    addi a0, sp, 144
+; RV32-NEXT:    vlse64.v v21, (a0), zero
+; RV32-NEXT:    addi a0, sp, 160
+; RV32-NEXT:    vlse64.v v22, (a0), zero
+; RV32-NEXT:    addi a0, sp, 176
+; RV32-NEXT:    vlse64.v v23, (a0), zero
+; RV32-NEXT:    vfslide1down.vf v20, v20, ft3
+; RV32-NEXT:    vfslide1down.vf v21, v21, ft2
+; RV32-NEXT:    vfslide1down.vf v22, v22, ft1
+; RV32-NEXT:    vfslide1down.vf v23, v23, ft0
+; RV32-NEXT:    fld fs0, 24(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs1, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: buildvec_v32f64_exact_vlen:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -384
-; RV64-NEXT:    .cfi_def_cfa_offset 384
-; RV64-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
-; RV64-NEXT:    fsd fs0, 360(sp) # 8-byte Folded Spill
-; RV64-NEXT:    fsd fs1, 352(sp) # 8-byte Folded Spill
-; RV64-NEXT:    fsd fs2, 344(sp) # 8-byte Folded Spill
-; RV64-NEXT:    fsd fs3, 336(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset fs0, -24
-; RV64-NEXT:    .cfi_offset fs1, -32
-; RV64-NEXT:    .cfi_offset fs2, -40
-; RV64-NEXT:    .cfi_offset fs3, -48
-; RV64-NEXT:    addi s0, sp, 384
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    fld ft0, 0(s0)
-; RV64-NEXT:    fld ft1, 8(s0)
-; RV64-NEXT:    fld ft2, 16(s0)
-; RV64-NEXT:    fld ft3, 24(s0)
-; RV64-NEXT:    fld ft4, 32(s0)
-; RV64-NEXT:    fld ft5, 40(s0)
-; RV64-NEXT:    fld ft6, 48(s0)
-; RV64-NEXT:    fld ft7, 56(s0)
-; RV64-NEXT:    fld ft8, 64(s0)
-; RV64-NEXT:    fld ft9, 72(s0)
-; RV64-NEXT:    fld ft10, 80(s0)
-; RV64-NEXT:    fld ft11, 88(s0)
-; RV64-NEXT:    fld fs0, 96(s0)
-; RV64-NEXT:    fld fs1, 104(s0)
-; RV64-NEXT:    fld fs2, 112(s0)
-; RV64-NEXT:    fld fs3, 120(s0)
-; RV64-NEXT:    sd a7, 248(sp)
-; RV64-NEXT:    sd a6, 240(sp)
-; RV64-NEXT:    sd a5, 232(sp)
-; RV64-NEXT:    sd a4, 224(sp)
-; RV64-NEXT:    sd a3, 216(sp)
-; RV64-NEXT:    sd a2, 208(sp)
-; RV64-NEXT:    sd a1, 200(sp)
-; RV64-NEXT:    sd a0, 192(sp)
-; RV64-NEXT:    fsd fa7, 184(sp)
-; RV64-NEXT:    fsd fa6, 176(sp)
-; RV64-NEXT:    fsd fa5, 168(sp)
-; RV64-NEXT:    fsd fa4, 160(sp)
-; RV64-NEXT:    fsd fa3, 152(sp)
-; RV64-NEXT:    fsd fa2, 144(sp)
-; RV64-NEXT:    fsd fa1, 136(sp)
-; RV64-NEXT:    fsd fa0, 128(sp)
-; RV64-NEXT:    fsd fs3, 120(sp)
-; RV64-NEXT:    fsd fs2, 112(sp)
-; RV64-NEXT:    fsd fs1, 104(sp)
-; RV64-NEXT:    fsd fs0, 96(sp)
-; RV64-NEXT:    fsd ft11, 88(sp)
-; RV64-NEXT:    fsd ft10, 80(sp)
-; RV64-NEXT:    fsd ft9, 72(sp)
-; RV64-NEXT:    fsd ft8, 64(sp)
-; RV64-NEXT:    fsd ft7, 56(sp)
-; RV64-NEXT:    fsd ft6, 48(sp)
-; RV64-NEXT:    fsd ft5, 40(sp)
-; RV64-NEXT:    fsd ft4, 32(sp)
-; RV64-NEXT:    fsd ft3, 24(sp)
-; RV64-NEXT:    fsd ft2, 16(sp)
-; RV64-NEXT:    fsd ft1, 8(sp)
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    .cfi_def_cfa_offset 32
+; RV64-NEXT:    fsd fs0, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    fsd fs1, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    fsd fs2, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    fsd fs3, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset fs0, -8
+; RV64-NEXT:    .cfi_offset fs1, -16
+; RV64-NEXT:    .cfi_offset fs2, -24
+; RV64-NEXT:    .cfi_offset fs3, -32
+; RV64-NEXT:    fmv.d.x ft4, a7
+; RV64-NEXT:    fmv.d.x ft5, a6
+; RV64-NEXT:    fmv.d.x ft6, a5
+; RV64-NEXT:    fmv.d.x ft7, a4
+; RV64-NEXT:    fmv.d.x ft8, a3
+; RV64-NEXT:    fmv.d.x ft9, a2
+; RV64-NEXT:    fmv.d.x ft10, a1
+; RV64-NEXT:    fmv.d.x ft11, a0
+; RV64-NEXT:    fld ft0, 152(sp)
+; RV64-NEXT:    fld ft1, 136(sp)
+; RV64-NEXT:    fld ft2, 120(sp)
+; RV64-NEXT:    fld ft3, 104(sp)
+; RV64-NEXT:    fld fs0, 88(sp)
+; RV64-NEXT:    fld fs1, 72(sp)
+; RV64-NEXT:    fld fs2, 40(sp)
+; RV64-NEXT:    fld fs3, 56(sp)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vfmv.v.f v8, fa2
+; RV64-NEXT:    vfslide1down.vf v9, v8, fa3
+; RV64-NEXT:    vfmv.v.f v8, fa0
+; RV64-NEXT:    vfslide1down.vf v8, v8, fa1
+; RV64-NEXT:    vfmv.v.f v10, fa4
+; RV64-NEXT:    vfslide1down.vf v10, v10, fa5
+; RV64-NEXT:    vfmv.v.f v11, fa6
+; RV64-NEXT:    vfslide1down.vf v11, v11, fa7
+; RV64-NEXT:    vfmv.v.f v12, ft11
+; RV64-NEXT:    vfslide1down.vf v12, v12, ft10
+; RV64-NEXT:    vfmv.v.f v13, ft9
+; RV64-NEXT:    vfslide1down.vf v13, v13, ft8
+; RV64-NEXT:    vfmv.v.f v14, ft7
+; RV64-NEXT:    vfslide1down.vf v14, v14, ft6
+; RV64-NEXT:    vfmv.v.f v15, ft5
+; RV64-NEXT:    vfslide1down.vf v15, v15, ft4
+; RV64-NEXT:    addi a0, sp, 48
+; RV64-NEXT:    vlse64.v v16, (a0), zero
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vlse64.v v18, (a0), zero
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vlse64.v v19, (a0), zero
+; RV64-NEXT:    addi a0, sp, 80
+; RV64-NEXT:    vlse64.v v20, (a0), zero
+; RV64-NEXT:    vfslide1down.vf v17, v16, fs3
+; RV64-NEXT:    vfslide1down.vf v16, v18, fs2
+; RV64-NEXT:    vfslide1down.vf v18, v19, fs1
+; RV64-NEXT:    vfslide1down.vf v19, v20, fs0
+; RV64-NEXT:    addi a0, sp, 96
+; RV64-NEXT:    vlse64.v v20, (a0), zero
+; RV64-NEXT:    addi a0, sp, 112
+; RV64-NEXT:    vlse64.v v21, (a0), zero
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vle64.v v16, (a0)
-; RV64-NEXT:    addi sp, s0, -384
-; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
-; RV64-NEXT:    fld fs0, 360(sp) # 8-byte Folded Reload
-; RV64-NEXT:    fld fs1, 352(sp) # 8-byte Folded Reload
-; RV64-NEXT:    fld fs2, 344(sp) # 8-byte Folded Reload
-; RV64-NEXT:    fld fs3, 336(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 384
+; RV64-NEXT:    vlse64.v v22, (a0), zero
+; RV64-NEXT:    addi a0, sp, 144
+; RV64-NEXT:    vlse64.v v23, (a0), zero
+; RV64-NEXT:    vfslide1down.vf v20, v20, ft3
+; RV64-NEXT:    vfslide1down.vf v21, v21, ft2
+; RV64-NEXT:    vfslide1down.vf v22, v22, ft1
+; RV64-NEXT:    vfslide1down.vf v23, v23, ft0
+; RV64-NEXT:    fld fs0, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    fld fs1, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    fld fs2, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    fld fs3, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 32
 ; RV64-NEXT:    ret
   %v0 = insertelement <32 x double> poison, double %e0, i64 0
   %v1 = insertelement <32 x double> %v0, double %e1, i64 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index 25dfec2670486..5dfa3835cad02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -852,24 +852,24 @@ define <8 x i32> @suffix_overwrite(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %
 define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) {
 ; RV32-LABEL: v4xi64_exact:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v9, v8, a7
 ; RV32-NEXT:    vmv.v.x v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a4
-; RV32-NEXT:    vslide1down.vx v8, v8, a5
-; RV32-NEXT:    vslide1down.vx v8, v8, a6
-; RV32-NEXT:    vslide1down.vx v8, v8, a7
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v4xi64_exact:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a2
+; RV64-NEXT:    vslide1down.vx v9, v8, a3
 ; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64-NEXT:    ret
   %v1 = insertelement <4 x i64> poison, i64 %a, i32 0
   %v2 = insertelement <4 x i64> %v1, i64 %b, i32 1
@@ -881,77 +881,43 @@ define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2)
 define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) vscale_range(2,2) {
 ; RV32-LABEL: v8xi64_exact:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 116(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s2, -12
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    lw t0, 0(s0)
-; RV32-NEXT:    lw t1, 4(s0)
-; RV32-NEXT:    lw t2, 8(s0)
-; RV32-NEXT:    lw t3, 12(s0)
-; RV32-NEXT:    lw t4, 28(s0)
-; RV32-NEXT:    lw t5, 24(s0)
-; RV32-NEXT:    lw t6, 20(s0)
-; RV32-NEXT:    lw s2, 16(s0)
-; RV32-NEXT:    sw t4, 60(sp)
-; RV32-NEXT:    sw t5, 56(sp)
-; RV32-NEXT:    sw t6, 52(sp)
-; RV32-NEXT:    sw s2, 48(sp)
-; RV32-NEXT:    sw t3, 44(sp)
-; RV32-NEXT:    sw t2, 40(sp)
-; RV32-NEXT:    sw t1, 36(sp)
-; RV32-NEXT:    sw t0, 32(sp)
-; RV32-NEXT:    sw a7, 28(sp)
-; RV32-NEXT:    sw a6, 24(sp)
-; RV32-NEXT:    sw a5, 20(sp)
-; RV32-NEXT:    sw a4, 16(sp)
-; RV32-NEXT:    sw a3, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 116(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    lw t0, 28(sp)
+; RV32-NEXT:    lw t1, 24(sp)
+; RV32-NEXT:    lw t2, 20(sp)
+; RV32-NEXT:    lw t3, 12(sp)
+; RV32-NEXT:    lw t4, 8(sp)
+; RV32-NEXT:    lw t5, 4(sp)
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v9, v8, a7
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vlse32.v v10, (sp), zero
+; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vlse32.v v11, (a0), zero
+; RV32-NEXT:    vslide1down.vx v10, v10, t5
+; RV32-NEXT:    vslide1down.vx v10, v10, t4
+; RV32-NEXT:    vslide1down.vx v10, v10, t3
+; RV32-NEXT:    vslide1down.vx v11, v11, t2
+; RV32-NEXT:    vslide1down.vx v11, v11, t1
+; RV32-NEXT:    vslide1down.vx v11, v11, t0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v8xi64_exact:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    sd a7, 56(sp)
-; RV64-NEXT:    sd a6, 48(sp)
-; RV64-NEXT:    sd a5, 40(sp)
-; RV64-NEXT:    sd a4, 32(sp)
-; RV64-NEXT:    sd a3, 24(sp)
-; RV64-NEXT:    sd a2, 16(sp)
-; RV64-NEXT:    sd a1, 8(sp)
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a2
+; RV64-NEXT:    vslide1down.vx v9, v8, a3
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vmv.v.x v10, a4
+; RV64-NEXT:    vslide1down.vx v10, v10, a5
+; RV64-NEXT:    vmv.v.x v11, a6
+; RV64-NEXT:    vslide1down.vx v11, v11, a7
 ; RV64-NEXT:    ret
   %v1 = insertelement <8 x i64> poison, i64 %a, i32 0
   %v2 = insertelement <8 x i64> %v1, i64 %b, i32 1
@@ -967,66 +933,28 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i
 define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) {
 ; RV32-LABEL: v8xi64_exact_equal_halves:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    sw a7, 60(sp)
-; RV32-NEXT:    sw a6, 56(sp)
-; RV32-NEXT:    sw a5, 52(sp)
-; RV32-NEXT:    sw a4, 48(sp)
-; RV32-NEXT:    sw a3, 44(sp)
-; RV32-NEXT:    sw a2, 40(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a0, 32(sp)
-; RV32-NEXT:    sw a7, 28(sp)
-; RV32-NEXT:    sw a6, 24(sp)
-; RV32-NEXT:    sw a5, 20(sp)
-; RV32-NEXT:    sw a4, 16(sp)
-; RV32-NEXT:    sw a3, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v9, v8, a7
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vmv.v.v v10, v8
+; RV32-NEXT:    vmv.v.v v11, v9
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v8xi64_exact_equal_halves:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    sd a3, 56(sp)
-; RV64-NEXT:    sd a2, 48(sp)
-; RV64-NEXT:    sd a1, 40(sp)
-; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    sd a3, 24(sp)
-; RV64-NEXT:    sd a2, 16(sp)
-; RV64-NEXT:    sd a1, 8(sp)
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a2
+; RV64-NEXT:    vslide1down.vx v9, v8, a3
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vmv.v.v v10, v8
+; RV64-NEXT:    vmv.v.v v11, v9
 ; RV64-NEXT:    ret
   %v1 = insertelement <8 x i64> poison, i64 %a, i32 0
   %v2 = insertelement <8 x i64> %v1, i64 %b, i32 1
@@ -1042,54 +970,24 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) {
 ; RV32-LABEL: v8xi64_exact_undef_suffix:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    sw a7, 28(sp)
-; RV32-NEXT:    sw a6, 24(sp)
-; RV32-NEXT:    sw a5, 20(sp)
-; RV32-NEXT:    sw a4, 16(sp)
-; RV32-NEXT:    sw a3, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v9, v8, a7
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v8xi64_exact_undef_suffix:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    sd a3, 24(sp)
-; RV64-NEXT:    sd a2, 16(sp)
-; RV64-NEXT:    sd a1, 8(sp)
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a2
+; RV64-NEXT:    vslide1down.vx v9, v8, a3
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    ret
   %v1 = insertelement <8 x i64> poison, i64 %a, i32 0
   %v2 = insertelement <8 x i64> %v1, i64 %b, i32 1
@@ -1101,54 +999,24 @@ define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) {
 ; RV32-LABEL: v8xi64_exact_undef_prefix:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    sw a7, 60(sp)
-; RV32-NEXT:    sw a6, 56(sp)
-; RV32-NEXT:    sw a5, 52(sp)
-; RV32-NEXT:    sw a4, 48(sp)
-; RV32-NEXT:    sw a3, 44(sp)
-; RV32-NEXT:    sw a2, 40(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a0, 32(sp)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v11, v8, a7
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v10, v8, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v8xi64_exact_undef_prefix:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    sd a3, 56(sp)
-; RV64-NEXT:    sd a2, 48(sp)
-; RV64-NEXT:    sd a1, 40(sp)
-; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a2
+; RV64-NEXT:    vslide1down.vx v11, v8, a3
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v10, v8, a1
 ; RV64-NEXT:    ret
   %v1 = insertelement <8 x i64> poison, i64 %a, i32 4
   %v2 = insertelement <8 x i64> %v1, i64 %b, i32 5