From 44892e524edb3041ce6ca8b74baeba568668b9ab Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Wed, 16 Oct 2024 14:45:43 +0000
Subject: [PATCH 1/9] [AArch64][SVE] Avoid transfer to GPRs for fp -> int -> fp
 conversions

When Neon is not available, use the SVE variants of FCVTZS, FCVTZU, UCVTF,
and SCVTF for fp -> int -> fp conversions to avoid moving values to/from
GPRs, which may be expensive.

Note: With +sme2p2 the single-element vector Neon variants of these
instructions could be used instead (but that feature is not implemented
yet).

Follow-up to #112213.
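For example (taken from the t1 test below), a double -> i64 -> double
round trip in a streaming-compatible function can now be lowered as:

  fcvtzs z0.d, p0/m, z0.d
  scvtf z0.d, p0/m, z0.d

instead of bouncing the intermediate integer through a GPR:

  fcvtzs x8, d0
  scvtf d0, x8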
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 35 ++++++++++++++
 .../sve-streaming-mode-cvt-fp-int-fp.ll       | 46 ++++++++++++-------
 2 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 59859cb7442d5..b99f2ee7e1b48 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,6 +2421,41 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
+// Helper for creating fp -> int -> fp conversions using SVE.
+class sve_fp_int_fp_cvt
+  : OutPatFrag<(ops node: $Rn),
+     (EXTRACT_SUBREG
+      (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
+       (TO_INT (IMPLICIT_DEF), (PTRUE 1),
+        (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
+
+// Some float -> int -> float conversion patterns where we want to keep the int
+// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
+// register transfers. Only used when NEON is not available (e.g. in streaming
+// functions).
+// TODO: When +sme2p2 is available single-element vectors should be preferred.
+def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
+let Predicates = [HasSVEorSME, HasNoNEON] in {
+def : Pat<
+  (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+  (sve_fp_int_fp_cvt $Rn)>;
+def : Pat<
+  (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+  (sve_fp_int_fp_cvt $Rn)>;
+def : Pat<
+  (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+  (sve_fp_int_fp_cvt $Rn)>;
+def : Pat<
+  (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+  (sve_fp_int_fp_cvt $Rn)>;
+def : Pat<
+  (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+  (sve_fp_int_fp_cvt $Rn)>;
+def : Pat<
+  (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+  (sve_fp_int_fp_cvt $Rn)>;
+} // End HasSVEorSME, HasNoNEON
+
 let Predicates = [HasBF16, HasSVEorSME] in {
   defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
   defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index f402463de7be8..0acc107280ac8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -8,8 +8,11 @@ target triple = "aarch64-unknown-linux-gnu"
 define double @t1(double %x) {
 ; CHECK-LABEL: t1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t1:
@@ -26,8 +29,11 @@ entry:
 define float @t2(float %x) {
 ; CHECK-LABEL: t2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t2:
@@ -44,10 +50,11 @@ entry:
 define half @t3(half %x) {
 ; CHECK-LABEL: t3:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s0, w8
-; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t3:
@@ -66,8 +73,11 @@ entry:
 define double @t4(double %x) {
 ; CHECK-LABEL: t4:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu x8, d0
-; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t4:
@@ -84,8 +94,11 @@ entry:
 define float @t5(float %x) {
 ; CHECK-LABEL: t5:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t5:
@@ -102,10 +115,11 @@ entry:
 define half @t6(half %x) {
 ; CHECK-LABEL: t6:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: ucvtf s0, w8
-; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t6:

From 68540edf58bc4e0814203658f72d2572e0f4413a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Wed, 16 Oct 2024 15:22:57 +0000
Subject: [PATCH 2/9] Note that this is for scalars

---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index b99f2ee7e1b48..10ea3ad8392ff 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,39 +2421,40 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
-// Helper for creating fp -> int -> fp conversions using SVE.
-class sve_fp_int_fp_cvt
+// Helper for creating scalar fp -> int -> fp conversions using SVE.
+class sve_scalar_fp_int_fp_cvt
+
   : OutPatFrag<(ops node: $Rn),
      (EXTRACT_SUBREG
       (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
       (TO_INT (IMPLICIT_DEF), (PTRUE 1),
        (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
 
-// Some float -> int -> float conversion patterns where we want to keep the int
-// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
-// register transfers. Only used when NEON is not available (e.g. in streaming
-// functions).
-// TODO: When +sme2p2 is available single-element vectors should be preferred.
+// Some scalar float -> int -> float conversion patterns where we want to keep
+// the int values in FP registers to avoid costly GPR <-> FPR register
+// transfers using SVE instructions. Only used when NEON is not available (e.g.
+// in streaming functions).
+// TODO: When +sme2p2 is available Neon single-element vectors should be preferred.
 def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
 let Predicates = [HasSVEorSME, HasNoNEON] in {
 def : Pat<
   (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
-  (sve_fp_int_fp_cvt $Rn)>;
+  (sve_scalar_fp_int_fp_cvt $Rn)>;
 def : Pat<
   (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
-  (sve_fp_int_fp_cvt $Rn)>;
+  (sve_scalar_fp_int_fp_cvt $Rn)>;
 def : Pat<
   (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
-  (sve_fp_int_fp_cvt $Rn)>;
+  (sve_scalar_fp_int_fp_cvt $Rn)>;
 def : Pat<
   (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
-  (sve_fp_int_fp_cvt $Rn)>;
+  (sve_scalar_fp_int_fp_cvt $Rn)>;
 def : Pat<
   (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
-  (sve_fp_int_fp_cvt $Rn)>;
+  (sve_scalar_fp_int_fp_cvt $Rn)>;
 def : Pat<
   (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
-  (sve_fp_int_fp_cvt $Rn)>;
+  (sve_scalar_fp_int_fp_cvt $Rn)>;
 } // End HasSVEorSME, HasNoNEON
 
 let Predicates = [HasBF16, HasSVEorSME] in {

From 56b94106c57aaca221674e7872a59b1ea6b2949e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Mon, 21 Oct 2024 21:11:44 +0000
Subject: [PATCH 3/9] Lower scalar FP converts to SVE

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  98 ++++-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  40 +-
 .../sve-streaming-mode-cvt-fp-int-fp.ll       |  20 +-
 .../sve-streaming-mode-cvt-fp-to-int.ll       | 264 +++++++++++++
 .../sve-streaming-mode-cvt-int-to-fp.ll       | 265 +++++++++++++
 ...e-streaming-mode-fixed-length-fp-to-int.ll | 366 ++++++++----------
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 121 ++++--
 7 files changed, 880 insertions(+), 294 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5a848ada9dd8e..ab329a6dc7908 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1454,8 +1454,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::UINT_TO_FP, VT, Custom);
       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Custom);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Custom);
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::MULHS, VT, Custom);
@@ -2138,6 +2142,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::FP_ROUND, VT, Default);
   setOperationAction(ISD::FP_TO_SINT, VT, Default);
   setOperationAction(ISD::FP_TO_UINT, VT, Default);
+  setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Default);
+  setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Default);
   setOperationAction(ISD::FRINT, VT, Default);
   setOperationAction(ISD::LRINT, VT, Default);
   setOperationAction(ISD::LLRINT, VT, Default);
@@ -2164,6 +2170,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::SIGN_EXTEND, VT, Default);
   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default);
   setOperationAction(ISD::SINT_TO_FP, VT, Default);
+  setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Default);
   setOperationAction(ISD::SMAX, VT, Default);
   setOperationAction(ISD::SMIN, VT, Default);
   setOperationAction(ISD::SPLAT_VECTOR, VT, Default);
@@ -2174,6 +2181,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::TRUNCATE, VT, Default);
   setOperationAction(ISD::UDIV, VT, Default);
   setOperationAction(ISD::UINT_TO_FP, VT, Default);
+  setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Default);
   setOperationAction(ISD::UMAX, VT, Default);
   setOperationAction(ISD::UMIN, VT, Default);
   setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
@@ -4550,9 +4558,10 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   EVT VT = Op.getValueType();
 
   if (VT.isScalableVector()) {
-    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
-                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
-                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
+    unsigned Opc = Op.getOpcode();
+    bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
+    unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
+                               : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
     return LowerToPredicatedOp(Op, DAG, Opcode);
   }
@@ -4628,6 +4637,51 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   return Op;
 }
 
+static bool CanLowerToScalarSVEFPIntConversion(EVT VT) {
+  if (!VT.isSimple())
+    return false;
+  // There are SVE instructions that can convert to/from all pairs of these int
+  // and float types. Note: We don't bother with i8 or i16 as those are illegal
+  // types for scalars.
+  return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
+                      VT.getSimpleVT().SimpleTy);
+}
+
+/// Lowers a scalar FP conversion (to/from) int to SVE.
+static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) {
+  bool IsStrict = Op->isStrictFPOpcode();
+  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+  EVT SrcTy = SrcVal.getValueType();
+  EVT DestTy = Op.getValueType();
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  // Use a packed vector for the larger type.
+  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
+  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
+  // (unlike floats) nxv2i32 is an illegal unpacked type.
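+  // For example (for illustration only; these pairs follow from the rules
+  // below): f64 -> i32 uses SrcVecTy = nxv2f64 with DestVecTy = nxv4i32,
+  // while i32 -> f64 uses SrcVecTy = nxv4i32 with DestVecTy = nxv2f64.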
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
+                                 : DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
+                                   : SrcVecTy.changeVectorElementType(DestTy);
+  }
+  SDLoc dl(Op);
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+  Vec = IsStrict ? DAG.getNode(Op.getOpcode(), dl, {DestVecTy, MVT::Other},
+                               {Op.getOperand(0), Vec})
+                 : DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec);
+  SDValue Scalar =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, ZeroIdx);
+  if (IsStrict)
+    return DAG.getMergeValues({Scalar, Vec.getValue(1)}, dl);
+  return Scalar;
+}
+
 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                               SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
@@ -4636,6 +4690,12 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   if (SrcVal.getValueType().isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
+  if (!Subtarget->isNeonAvailable() &&
+      Subtarget->isSVEorStreamingSVEAvailable() &&
+      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
+      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
+    return LowerScalarFPConversionToSVE(Op, DAG);
+
   // f16 conversions are promoted to f32 when full fp16 is not supported.
   if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
       SrcVal.getValueType() == MVT::bf16) {
@@ -4939,6 +4999,12 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   bool IsStrict = Op->isStrictFPOpcode();
   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
 
+  if (!Subtarget->isNeonAvailable() &&
+      Subtarget->isSVEorStreamingSVEAvailable() &&
+      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
+      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
+    return LowerScalarFPConversionToSVE(Op, DAG);
+
   bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
                   Op->getOpcode() == ISD::SINT_TO_FP;
 
@@ -28327,7 +28393,21 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  auto Pg = getPredicateForVector(DAG, DL, VT);
+  SDValue Pg;
+
+  // FCVTZS_ZPmZ_DtoS and FCVTZU_ZPmZ_DtoS are special cases. These operations
+  // return nxv4i32 rather than the correct nxv2i32, as nxv2i32 is an illegal
+  // unpacked type. So, in this case, we take the predicate size from the
+  // operand.
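+  // For example (for illustration): an FCVTZS of nxv2f64 produces a node
+  // typed nxv4i32, but only the two double-sized lanes are meaningful, so
+  // the governing predicate must be the nxv2i1 implied by the nxv2f64
+  // operand, not the nxv4i1 implied by the result type.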
+  SDValue LastOp{};
+  if ((NewOp == AArch64ISD::FCVTZU_MERGE_PASSTHRU ||
+       NewOp == AArch64ISD::FCVTZS_MERGE_PASSTHRU) &&
+      VT == MVT::nxv4i32 &&
+      (LastOp = Op->ops().back().get()).getValueType() == MVT::nxv2f64) {
+    Pg = getPredicateForVector(DAG, DL, LastOp.getValueType());
+  } else {
+    Pg = getPredicateForVector(DAG, DL, VT);
+  }
 
   if (VT.isFixedLengthVector()) {
     assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
@@ -28363,7 +28443,12 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
   assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
 
   SmallVector Operands = {Pg};
+  SDValue Chain{};
   for (const SDValue &V : Op->op_values()) {
+    if (!isa(V) && V.getValueType() == MVT::Other) {
+      Chain = V;
+      continue;
+    }
     assert((!V.getValueType().isVector() ||
             V.getValueType().isScalableVector()) &&
            "Only scalable vectors are supported!");
@@ -28373,7 +28458,10 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
   if (isMergePassthruOpcode(NewOp))
     Operands.push_back(DAG.getUNDEF(VT));
 
-  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
+  auto NewNode = DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
+  if (Chain)
+    return DAG.getMergeValues({NewNode, Chain}, DL);
+  return NewNode;
 }
 
 // If a fixed length vector operation has no side effects when applied to
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 10ea3ad8392ff..dfff9c627540b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2338,8 +2338,8 @@ let Predicates = [HasSVEorSME] in {
   defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>;
   defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
   defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
-  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
-  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, AArch64fcvtzs_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, AArch64fcvtzu_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
   defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>;
   defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>;
   defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>;
@@ -2421,42 +2421,6 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
-// Helper for creating scalar fp -> int -> fp conversions using SVE.
-class sve_scalar_fp_int_fp_cvt
-
-  : OutPatFrag<(ops node: $Rn),
-     (EXTRACT_SUBREG
-      (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
-      (TO_INT (IMPLICIT_DEF), (PTRUE 1),
-       (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
-
-// Some scalar float -> int -> float conversion patterns where we want to keep
-// the int values in FP registers to avoid costly GPR <-> FPR register
-// transfers using SVE instructions. Only used when NEON is not available (e.g.
-// in streaming functions).
-// TODO: When +sme2p2 is available Neon single-element vectors should be preferred.
-def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
-let Predicates = [HasSVEorSME, HasNoNEON] in {
-def : Pat<
-  (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt $Rn)>;
-def : Pat<
-  (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt $Rn)>;
-def : Pat<
-  (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt $Rn)>;
-def : Pat<
-  (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt $Rn)>;
-def : Pat<
-  (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt $Rn)>;
-def : Pat<
-  (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt $Rn)>;
-} // End HasSVEorSME, HasNoNEON
-
 let Predicates = [HasBF16, HasSVEorSME] in {
   defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
   defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index 0acc107280ac8..0f4cb2060f249 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define double @t1(double %x) {
 ; CHECK-LABEL: t1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
@@ -29,7 +29,7 @@ entry:
 define float @t2(float %x) {
 ; CHECK-LABEL: t2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
@@ -50,10 +50,10 @@ entry:
 define half @t3(half %x) {
 ; CHECK-LABEL: t3:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
@@ -73,7 +73,7 @@ entry:
 define double @t4(double %x) {
 ; CHECK-LABEL: t4:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
@@ -94,7 +94,7 @@ entry:
 define float @t5(float %x) {
 ; CHECK-LABEL: t5:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
@@ -115,10 +115,10 @@ entry:
 define half @t6(half %x) {
 ; CHECK-LABEL: t6:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
-; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t6:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
new file mode 100644
index 0000000000000..60d3124f5b21e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @f16_to_s32(half %x) {
+; CHECK-LABEL: f16_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f16_to_s32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzs w0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptosi half %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f16_to_s64(half %x) {
+; CHECK-LABEL: f16_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f16_to_s64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzs x0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptosi half %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f32_to_s32(float %x) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f32_to_s32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs w0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptosi float %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f32_to_s64(float %x) {
+; CHECK-LABEL: f32_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f32_to_s64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs x0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptosi float %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f64_to_s32(double %x) {
+; CHECK-LABEL: f64_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f64_to_s32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs w0, d0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptosi double %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f64_to_s64(double %x) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f64_to_s64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs x0, d0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptosi double %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f16_to_u32(half %x) {
+; CHECK-LABEL: f16_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f16_to_u32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzu w0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptoui half %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f16_to_u64(half %x) {
+; CHECK-LABEL: f16_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f16_to_u64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzu x0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptoui half %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f32_to_u32(float %x) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f32_to_u32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu w0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptoui float %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f32_to_u64(float %x) {
+; CHECK-LABEL: f32_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f32_to_u64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu x0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptoui float %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f64_to_u32(double %x) {
+; CHECK-LABEL: f64_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.d
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f64_to_u32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu w0, d0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptoui double %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f64_to_u64(double %x) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: f64_to_u64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu x0, d0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = fptoui double %x to i64
+  ret i64 %cvt
+}
+
+define i32 @strict_convert_signed(double %x) {
+; CHECK-LABEL: strict_convert_signed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_signed:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs w0, d0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %x, metadata !"fpexcept.strict") #0
+  ret i32 %cvt
+}
+
+define i32 @strict_convert_unsigned(float %x) {
+; CHECK-LABEL: strict_convert_unsigned:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_unsigned:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu w0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict") #0
+  ret i32 %cvt
+}
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
new file mode 100644
index 0000000000000..42be60ad55970
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define half @s32_to_f16(i32 %x) {
+; CHECK-LABEL: s32_to_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: s32_to_f16:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf s0, w0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = sitofp i32 %x to half
+  ret half %cvt
+}
+
+define float @s32_to_f32(i32 %x) {
+; CHECK-LABEL: s32_to_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: s32_to_f32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf s0, w0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = sitofp i32 %x to float
+  ret float %cvt
+}
+
+define double @s32_to_f64(i32 %x) {
+; CHECK-LABEL: s32_to_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: s32_to_f64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf d0, w0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = sitofp i32 %x to double
+  ret double %cvt
+}
+
+define half @u32_to_f16(i32 %x) {
+; CHECK-LABEL: u32_to_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: u32_to_f16:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf s0, w0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = uitofp i32 %x to half
+  ret half %cvt
+}
+
+define float @u32_to_f32(i32 %x) {
+; CHECK-LABEL: u32_to_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: u32_to_f32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf s0, w0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = uitofp i32 %x to float
+  ret float %cvt
+}
+
+define double @u32_to_f64(i32 %x) {
+; CHECK-LABEL: u32_to_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: u32_to_f64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf d0, w0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = uitofp i32 %x to double
+  ret double %cvt
+}
+
+define half @s64_to_f16(i64 %x) {
+; CHECK-LABEL: s64_to_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: s64_to_f16:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf s0, x0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = sitofp i64 %x to half
+  ret half %cvt
+}
+
+define float @s64_to_f32(i64 %x) {
+; CHECK-LABEL: s64_to_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: s64_to_f32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf s0, x0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = sitofp i64 %x to float
+  ret float %cvt
+}
+
+define double @s64_to_f64(i64 %x) {
+; CHECK-LABEL: s64_to_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: s64_to_f64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf d0, x0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = sitofp i64 %x to double
+  ret double %cvt
+}
+
+define half @u64_to_f16(i64 %x) {
+; CHECK-LABEL: u64_to_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: u64_to_f16:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf s0, x0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = uitofp i64 %x to half
+  ret half %cvt
+}
+
+define float @u64_to_f32(i64 %x) {
+; CHECK-LABEL: u64_to_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: u64_to_f32:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf s0, x0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = uitofp i64 %x to float
+  ret float %cvt
+}
+
+define double @u64_to_f64(i64 %x) {
+; CHECK-LABEL: u64_to_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: u64_to_f64:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf d0, x0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = uitofp i64 %x to double
+  ret double %cvt
+}
+
+define half @strict_convert_signed(i32 %x) {
+; CHECK-LABEL: strict_convert_signed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_signed:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: scvtf s0, w0
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %cvt
+}
+
+define float @strict_convert_unsigned(i64 %x) {
+; CHECK-LABEL: strict_convert_unsigned:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_unsigned:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: ucvtf s0, x0
+; NONEON-NOSVE-NEXT: ret
+entry:
+  %cvt = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %cvt
+}
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 11fee267660c0..5e162fbfef196 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -418,8 +418,10 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f16_v1i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
@@ -441,10 +443,9 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
 ; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT: ret
@@ -472,20 +473,17 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.h, z0.h[3]
 ; CHECK-NEXT: mov z2.h, z0.h[2]
 ; CHECK-NEXT: mov z3.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x10, h0
-; CHECK-NEXT: fcvtzu x8, h1
-; CHECK-NEXT: fcvtzu x9, h2
-; CHECK-NEXT: fcvtzu x11, h3
-; CHECK-NEXT: fmov d2, x10
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT: fmov d1, x11
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h
+; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h
 ; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
-; CHECK-NEXT: stp q1, q0, [x1]
+; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: stp q0, q1, [x1]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
@@ -522,36 +520,29 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, z0.d
 ; CHECK-NEXT: mov z2.h, z0.h[3]
 ; CHECK-NEXT: mov z3.h, z0.h[2]
 ; CHECK-NEXT: mov z4.h, z0.h[1]
-; CHECK-NEXT: fcvtzu x10, h0
 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT: fcvtzu x8, h2
-; CHECK-NEXT: fcvtzu x9, h3
-; CHECK-NEXT: fcvtzu x11, h4
+; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h
+; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h
 ; CHECK-NEXT: mov z5.h, z1.h[3]
 ; CHECK-NEXT: mov z6.h, z1.h[2]
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: fcvtzu x14, h1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d3, x11
-; CHECK-NEXT: fcvtzu x12, h5
-; CHECK-NEXT: fcvtzu x13, h6
-; CHECK-NEXT: fcvtzu x15, h2
-; CHECK-NEXT: fmov d2, x10
-; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT: fmov d1, x12
-; CHECK-NEXT: fmov d4, x13
-; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
-; CHECK-NEXT: fmov d3, x14
-; CHECK-NEXT: zip1 z1.d, z4.d, z1.d
-; CHECK-NEXT: fmov d4, x15
-; CHECK-NEXT: stp q2, q0, [x1]
-; CHECK-NEXT: zip1 z3.d, z3.d, z4.d
-; CHECK-NEXT: stp q3, q1, [x1, #32]
+; CHECK-NEXT: mov z7.h, z1.h[1]
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z4.d
+; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h
+; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h
+; CHECK-NEXT: stp q0, q2, [x1]
+; CHECK-NEXT: zip1 z3.d, z6.d, z5.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z7.d
+; CHECK-NEXT: stp q1, q3, [x1, #32]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
@@ -604,67 +595,54 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: mov z5.d, z1.d
-; CHECK-NEXT: mov z2.h, z0.h[3]
-; CHECK-NEXT: mov z4.h, z1.h[1]
-; CHECK-NEXT: mov z6.h, z1.h[3]
-; CHECK-NEXT: fcvtzu x9, h1
-; CHECK-NEXT: fcvtzu x8, h0
-; CHECK-NEXT: mov z7.h, z0.h[1]
-; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
-; CHECK-NEXT: fcvtzu x10, h2
-; CHECK-NEXT: fcvtzu x11, h4
-; CHECK-NEXT: fcvtzu x12, h6
-; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: fmov d16, x9
-; CHECK-NEXT: mov z2.h, z3.h[3]
-; CHECK-NEXT: mov z4.h, z5.h[3]
-; CHECK-NEXT: fcvtzu x14, h3
-; CHECK-NEXT: fcvtzu x13, h1
-; CHECK-NEXT: fcvtzu x15, h5
-; CHECK-NEXT: mov z1.h, z3.h[1]
-; CHECK-NEXT: mov z6.h, z5.h[1]
-; CHECK-NEXT: mov z5.h, z5.h[2]
-; CHECK-NEXT: mov z3.h, z3.h[2]
-; CHECK-NEXT: fcvtzu x9, h2
-; CHECK-NEXT: fmov d2, x10
-; CHECK-NEXT: fcvtzu x10, h4
-; CHECK-NEXT: fmov d4, x11
-; CHECK-NEXT: fcvtzu x11, h7
-; CHECK-NEXT: fmov d7, x12
-; CHECK-NEXT: fcvtzu x12, h0
-; CHECK-NEXT: fmov d0, x13
-; CHECK-NEXT: fcvtzu x13, h1
-; CHECK-NEXT: fmov d1, x14
-; CHECK-NEXT: fcvtzu x14, h6
-; CHECK-NEXT: fmov d6, x15
-; CHECK-NEXT: fcvtzu x15, h5
-; CHECK-NEXT: fmov d5, x9
-; CHECK-NEXT: fcvtzu x9, h3
-; CHECK-NEXT: zip1 z4.d, z16.d, z4.d
-; CHECK-NEXT: fmov d16, x8
-; CHECK-NEXT: zip1 z0.d, z0.d, z7.d
-; CHECK-NEXT: fmov d3, x12
-; CHECK-NEXT: fmov d7, x10
-; CHECK-NEXT: stp q4, q0, [x1, #64]
-; CHECK-NEXT: fmov d0, x14
-; CHECK-NEXT: fmov d4, x9
-; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
-; CHECK-NEXT: fmov d3, x11
-; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
-; CHECK-NEXT: zip1 z4.d, z4.d, z5.d
-; CHECK-NEXT: zip1 z3.d, z16.d, z3.d
-; CHECK-NEXT: fmov d16, x15
-; CHECK-NEXT: stp q3, q2, [x1]
-; CHECK-NEXT: fmov d2, x13
-; CHECK-NEXT: zip1 z7.d, z16.d, z7.d
-; CHECK-NEXT: zip1 z1.d, z1.d, z2.d
-; CHECK-NEXT: stp q0, q7, [x1, #96]
-; CHECK-NEXT: stp q1, q4, [x1, #32]
+; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z3.h, z1.h[1]
+; CHECK-NEXT: mov z5.h, z0.h[3]
+; CHECK-NEXT: mov z6.h, z0.h[2]
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: movprfx z2, z1
+; CHECK-NEXT: fcvtzu z2.d, p0/m, z1.h
+; CHECK-NEXT: mov z4.h, z1.h[3]
+; CHECK-NEXT: mov z7.h, z1.h[2]
+; CHECK-NEXT: mov z17.h, z0.h[1]
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h
+; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h
+; CHECK-NEXT: fcvtzu z17.d, p0/m, z17.h
+; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h
+; CHECK-NEXT: mov z20.h, z1.h[3]
+; CHECK-NEXT: mov z18.h, z16.h[3]
+; CHECK-NEXT: mov z19.h, z16.h[2]
+; CHECK-NEXT: mov z21.h, z16.h[1]
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: mov z3.h, z1.h[2]
+; CHECK-NEXT: zip1 z5.d, z6.d, z5.d
+; CHECK-NEXT: mov z6.h, z1.h[1]
+; CHECK-NEXT: zip1 z0.d, z0.d, z17.d
+; CHECK-NEXT: fcvtzu z16.d, p0/m, z16.h
+; CHECK-NEXT: fcvtzu z18.d, p0/m, z18.h
+; CHECK-NEXT: movprfx z17, z21
+; CHECK-NEXT: fcvtzu z17.d, p0/m, z21.h
+; CHECK-NEXT: fcvtzu z19.d, p0/m, z19.h
+; CHECK-NEXT: zip1 z4.d, z7.d, z4.d
+; CHECK-NEXT: movprfx z7, z20
+; CHECK-NEXT: fcvtzu z7.d, p0/m, z20.h
+; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT: stp q0, q5, [x1, #64]
+; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT: zip1 z0.d, z19.d, z18.d
+; CHECK-NEXT: zip1 z5.d, z16.d, z17.d
+; CHECK-NEXT: stp q2, q4, [x1]
+; CHECK-NEXT: zip1 z2.d, z3.d, z7.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT: stp q5, q0, [x1, #96]
+; CHECK-NEXT: stp q1, q2, [x1, #32]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1186,7 +1164,10 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w8, d0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: mov z0.h, w8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
@@ -2135,8 +2116,10 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f16_v1i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
@@ -2159,10 +2142,9 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h
 ; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT: ret
@@ -2190,20 +2172,17 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.h, z0.h[3]
 ; CHECK-NEXT: mov z2.h, z0.h[2]
 ; CHECK-NEXT: mov z3.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x10, h0
-; CHECK-NEXT: fcvtzs x8, h1
-; CHECK-NEXT: fcvtzs x9, h2
-; CHECK-NEXT: fcvtzs x11, h3
-; CHECK-NEXT: fmov d2, x10
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT: fmov d1, x11
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.h
+; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h
 ; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
-; CHECK-NEXT: stp q1, q0, [x1]
+; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: stp q0, q1, [x1]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
@@ -2240,36 +2219,29 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, z0.d
 ; CHECK-NEXT: mov z2.h, z0.h[3]
 ; CHECK-NEXT: mov z3.h, z0.h[2]
 ; CHECK-NEXT: mov z4.h, z0.h[1]
-; CHECK-NEXT: fcvtzs x10, h0
 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT: fcvtzs x8, h2
-; CHECK-NEXT: fcvtzs x9, h3
-; CHECK-NEXT: fcvtzs x11, h4
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.h
+; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h
 ; CHECK-NEXT: mov z5.h, z1.h[3]
 ; CHECK-NEXT: mov z6.h, z1.h[2]
-; CHECK-NEXT: mov z2.h, z1.h[1]
-; CHECK-NEXT: fcvtzs x14, h1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d3, x11
-; CHECK-NEXT: fcvtzs x12, h5
-; CHECK-NEXT: fcvtzs x13, h6
-; CHECK-NEXT: fcvtzs x15, h2
-; CHECK-NEXT: fmov d2, x10
-; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT: fmov d1, x12
-; CHECK-NEXT: fmov d4, x13
-; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
-; CHECK-NEXT: fmov d3, x14
-; CHECK-NEXT: zip1 z1.d, z4.d, z1.d
-; CHECK-NEXT: fmov d4, x15
-; CHECK-NEXT: stp q2, q0, [x1]
-; CHECK-NEXT: zip1 z3.d, z3.d, z4.d
-; CHECK-NEXT: stp q3, q1, [x1, #32]
+; CHECK-NEXT: mov z7.h, z1.h[1]
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z4.d
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h
+; CHECK-NEXT: stp q0, q2, [x1]
+; CHECK-NEXT: zip1 z3.d, z6.d, z5.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z7.d
+; CHECK-NEXT: stp q1, q3, [x1, #32]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
@@ -2322,67 +2294,54 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: mov z5.d, z1.d
-; CHECK-NEXT: mov z2.h, z0.h[3]
-; CHECK-NEXT: mov z4.h, z1.h[1]
-; CHECK-NEXT: mov z6.h, z1.h[3]
-; CHECK-NEXT: fcvtzs x9, h1
-; CHECK-NEXT: fcvtzs x8, h0
-; CHECK-NEXT: mov z7.h, z0.h[1]
-; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
-; CHECK-NEXT: fcvtzs x10, h2
-; CHECK-NEXT: fcvtzs x11, h4
-; CHECK-NEXT: fcvtzs x12, h6
-; CHECK-NEXT: mov z1.h, z1.h[2]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: fmov d16, x9
-; CHECK-NEXT: mov z2.h, z3.h[3]
-; CHECK-NEXT: mov z4.h, z5.h[3]
-; CHECK-NEXT: fcvtzs x14, h3
-; CHECK-NEXT: fcvtzs x13, h1
-; CHECK-NEXT: fcvtzs x15, h5
-; CHECK-NEXT: mov z1.h, z3.h[1]
-; CHECK-NEXT: mov z6.h, z5.h[1]
-; CHECK-NEXT: mov z5.h, z5.h[2]
-; CHECK-NEXT: mov z3.h, z3.h[2]
-; CHECK-NEXT: fcvtzs x9, h2
-; CHECK-NEXT: fmov d2, x10
-; CHECK-NEXT: fcvtzs x10, h4
-; CHECK-NEXT: fmov d4, x11
-; CHECK-NEXT: fcvtzs x11, h7
-; CHECK-NEXT: fmov d7, x12
-; CHECK-NEXT: fcvtzs x12, h0
-; CHECK-NEXT: fmov d0, x13
-; CHECK-NEXT: fcvtzs x13, h1
-; CHECK-NEXT: fmov d1, x14
-; CHECK-NEXT: fcvtzs x14, h6
-; CHECK-NEXT: fmov d6, x15
-; CHECK-NEXT: fcvtzs x15, h5
-; CHECK-NEXT: fmov d5, x9
-; CHECK-NEXT: fcvtzs x9, h3
-; CHECK-NEXT: zip1 z4.d, z16.d, z4.d
-; CHECK-NEXT: fmov d16, x8
-; CHECK-NEXT: zip1 z0.d, z0.d, z7.d
-; CHECK-NEXT: fmov d3, x12
-; CHECK-NEXT: fmov d7, x10
-; CHECK-NEXT: stp q4, q0, [x1, #64]
-; CHECK-NEXT: fmov d0, x14
-; CHECK-NEXT: fmov d4, x9
-; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
-; CHECK-NEXT: fmov d3, x11
-; CHECK-NEXT: zip1 z0.d, z6.d, z0.d
-; CHECK-NEXT: zip1 z4.d, z4.d, z5.d
-; CHECK-NEXT: zip1 z3.d, z16.d, z3.d
-; CHECK-NEXT: fmov d16, x15
-; CHECK-NEXT: stp q3, q2, [x1]
-; CHECK-NEXT: fmov d2, x13
-; CHECK-NEXT: zip1 z7.d, z16.d, z7.d
-; CHECK-NEXT: zip1 z1.d, z1.d, z2.d
-; CHECK-NEXT: stp q0, q7, [x1, #96]
-; CHECK-NEXT: stp q1, q4, [x1, #32]
+; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z3.h, z1.h[1]
+; CHECK-NEXT: mov z5.h, z0.h[3]
+; CHECK-NEXT: mov z6.h, z0.h[2]
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: movprfx z2, z1
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.h
+; CHECK-NEXT: mov z4.h, z1.h[3]
+; CHECK-NEXT: mov z7.h, z1.h[2]
+; CHECK-NEXT: mov z17.h, z0.h[1]
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h
+; CHECK-NEXT: fcvtzs z17.d, p0/m, z17.h
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h
+; CHECK-NEXT: mov z20.h, z1.h[3]
+; CHECK-NEXT: mov z18.h, z16.h[3]
+; CHECK-NEXT: mov z19.h, z16.h[2]
+; CHECK-NEXT: mov z21.h, z16.h[1]
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: mov z3.h, z1.h[2]
+; CHECK-NEXT: zip1 z5.d, z6.d, z5.d
+; CHECK-NEXT: mov z6.h, z1.h[1]
+; CHECK-NEXT: zip1 z0.d, z0.d, z17.d
+; CHECK-NEXT: fcvtzs z16.d, p0/m, z16.h
+; CHECK-NEXT: fcvtzs z18.d, p0/m, z18.h
+; CHECK-NEXT: movprfx z17, z21
+; CHECK-NEXT: fcvtzs z17.d, p0/m, z21.h
+; CHECK-NEXT: fcvtzs z19.d, p0/m, z19.h
+; CHECK-NEXT: zip1 z4.d, z7.d, z4.d
+; CHECK-NEXT: movprfx z7, z20
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z20.h
+; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT: stp q0, q5, [x1, #64]
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT: zip1 z0.d, z19.d, z18.d
+; CHECK-NEXT: zip1 z5.d, z16.d, z17.d
+; CHECK-NEXT: stp q2, q4, [x1]
+; CHECK-NEXT: zip1 z2.d, z3.d, z7.d
+; CHECK-NEXT: zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT: stp q5, q0, [x1, #96]
+; CHECK-NEXT: stp q1, q2, [x1, #32]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2906,7 +2865,10 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w8, d0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: mov z0.h, w8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index e595686cb4975..24ad0f502dbf3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -390,8 +390,11 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: ucvtf d0, w8
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
@@ -1142,10 +1145,9 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: ucvtf h0, x8
-; CHECK-NEXT: ucvtf h1, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: ucvtf z1.h, p0/m, z1.d
 ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
@@ -2596,10 +2598,9 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: scvtf h0, x8
-; CHECK-NEXT: scvtf h1, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: scvtf z1.h, p0/m, z1.d
 ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
@@ -2795,7 +2796,10 @@ define half @scvtf_i16_f16(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrsh w8, [x0]
-; CHECK-NEXT: scvtf h0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i16_f16:
@@ -2813,7 +2817,10 @@ define float @scvtf_i16_f32(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrsh w8, [x0]
-; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i16_f32:
@@ -2830,7 +2837,10 @@ define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrsh w8, [x0]
-; CHECK-NEXT: scvtf d0, w8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i16_f64:
@@ -2846,8 +2856,10 @@ define double @scvtf_i16_f64(ptr %0) {
 define half @scvtf_i32_f16(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: scvtf h0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i32_f16:
@@ -2864,8 +2876,10 @@ define half @scvtf_i32_f16(ptr %0) {
 define float @scvtf_i32_f32(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i32_f32:
@@ -2881,8 +2895,10 @@ define float @scvtf_i32_f32(ptr %0) {
 define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: scvtf d0, w8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i32_f64:
@@ -2898,8 +2914,10 @@ define double @scvtf_i32_f64(ptr %0) {
 define half @scvtf_i64_f16(ptr %0) {
 ; CHECK-LABEL: scvtf_i64_f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: scvtf h0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i64_f16:
@@ -2916,8 +2934,10 @@ define half @scvtf_i64_f16(ptr %0) {
 define float @scvtf_i64_f32(ptr %0) {
 ; CHECK-LABEL: scvtf_i64_f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i64_f32:
@@ -2933,8 +2953,10 @@ define float @scvtf_i64_f32(ptr %0) {
 define double @scvtf_i64_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i64_f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i64_f64:
@@ -2951,7 +2973,10 @@ define half @ucvtf_i16_f16(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ucvtf h0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
@@ -2969,7 +2994,10 @@ define float @ucvtf_i16_f32(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
@@ -2986,7 +3014,10 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ucvtf d0, w8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
@@ -3002,8 +3033,10 @@ define double @ucvtf_i16_f64(ptr %0) {
 define half @ucvtf_i32_f16(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: ucvtf h0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i32_f16:
@@ -3020,8 +3053,10 @@ define half @ucvtf_i32_f16(ptr %0) {
 define float @ucvtf_i32_f32(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i32_f32:
@@ -3037,8 +3072,10 @@ define float @ucvtf_i32_f32(ptr %0) {
 define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: ucvtf d0, w8
+; CHECK-NEXT: ptrue p0.d
+;
CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i32_f64: @@ -3054,8 +3091,10 @@ define double @ucvtf_i32_f64(ptr %0) { define half @ucvtf_i64_f16(ptr %0) { ; CHECK-LABEL: ucvtf_i64_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ucvtf h0, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i64_f16: @@ -3072,8 +3111,10 @@ define half @ucvtf_i64_f16(ptr %0) { define float @ucvtf_i64_f32(ptr %0) { ; CHECK-LABEL: ucvtf_i64_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i64_f32: @@ -3089,8 +3130,10 @@ define float @ucvtf_i64_f32(ptr %0) { define double @ucvtf_i64_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i64_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ucvtf d0, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i64_f64: From f59876f5258b3b3bbcddb7d9477e325cc5408151 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 23 Oct 2024 09:06:18 +0000 Subject: [PATCH 4/9] Remove strict converts --- .../Target/AArch64/AArch64ISelLowering.cpp | 37 ++++--------------- .../sve-streaming-mode-cvt-fp-to-int.ll | 10 +---- .../sve-streaming-mode-cvt-int-to-fp.ll | 12 ++---- 3 files changed, 13 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ab329a6dc7908..7e8153534076d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1454,12 +1454,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::SINT_TO_FP, VT, Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); @@ -2142,8 +2138,6 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::FP_ROUND, VT, Default); setOperationAction(ISD::FP_TO_SINT, VT, Default); setOperationAction(ISD::FP_TO_UINT, VT, Default); - setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Default); - setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Default); setOperationAction(ISD::FRINT, VT, Default); setOperationAction(ISD::LRINT, VT, Default); setOperationAction(ISD::LLRINT, VT, Default); @@ -2170,7 +2164,6 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::SIGN_EXTEND, VT, Default); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default); setOperationAction(ISD::SINT_TO_FP, VT, Default); - setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Default); 
setOperationAction(ISD::SMAX, VT, Default); setOperationAction(ISD::SMIN, VT, Default); setOperationAction(ISD::SPLAT_VECTOR, VT, Default); @@ -2181,7 +2174,6 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::TRUNCATE, VT, Default); setOperationAction(ISD::UDIV, VT, Default); setOperationAction(ISD::UINT_TO_FP, VT, Default); - setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Default); setOperationAction(ISD::UMAX, VT, Default); setOperationAction(ISD::UMIN, VT, Default); setOperationAction(ISD::VECREDUCE_ADD, VT, Default); @@ -4649,8 +4641,8 @@ static bool CanLowerToScalarSVEFPIntConversion(EVT VT) { /// Lowers a scalar FP conversion (to/from) int to SVE. static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) { - bool IsStrict = Op->isStrictFPOpcode(); - SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + assert(!Op->isStrictFPOpcode() && "strict fp ops not supported"); + SDValue SrcVal = Op.getOperand(0); EVT SrcTy = SrcVal.getValueType(); EVT DestTy = Op.getValueType(); EVT SrcVecTy; @@ -4672,14 +4664,9 @@ static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) { SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy, DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); - Vec = IsStrict ? DAG.getNode(Op.getOpcode(), dl, {DestVecTy, MVT::Other}, - {Op.getOperand(0), Vec}) - : DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec); - SDValue Scalar = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, ZeroIdx); - if (IsStrict) - return DAG.getMergeValues({Scalar, Vec.getValue(1)}, dl); - return Scalar; + Vec = DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, + ZeroIdx); } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, @@ -4690,7 +4677,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, if (SrcVal.getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (!Subtarget->isNeonAvailable() && + if (!IsStrict && !Subtarget->isNeonAvailable() && Subtarget->isSVEorStreamingSVEAvailable() && CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) && CanLowerToScalarSVEFPIntConversion(Op.getValueType())) @@ -4999,7 +4986,7 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); - if (!Subtarget->isNeonAvailable() && + if (!IsStrict && !Subtarget->isNeonAvailable() && Subtarget->isSVEorStreamingSVEAvailable() && CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) && CanLowerToScalarSVEFPIntConversion(Op.getValueType())) @@ -28443,12 +28430,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); SmallVector Operands = {Pg}; - SDValue Chain{}; for (const SDValue &V : Op->op_values()) { - if (!isa(V) && V.getValueType() == MVT::Other) { - Chain = V; - continue; - } assert((!V.getValueType().isVector() || V.getValueType().isScalableVector()) && "Only scalable vectors are supported!"); @@ -28458,10 +28440,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, if (isMergePassthruOpcode(NewOp)) Operands.push_back(DAG.getUNDEF(VT)); - auto NewNode = DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags()); - if (Chain) - return DAG.getMergeValues({NewNode, Chain}, DL); - return NewNode; + return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags()); } // If a fixed length vector operation has no side effects when applied to diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll index 60d3124f5b21e..300ccefc71c91 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll @@ -228,10 +228,7 @@ define i64 @f64_to_u64(double %x) { define i32 @strict_convert_signed(double %x) { ; CHECK-LABEL: strict_convert_signed: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: fcvtzs w0, d0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: strict_convert_signed: @@ -246,10 +243,7 @@ define i32 @strict_convert_signed(double %x) { define i32 @strict_convert_unsigned(float %x) { ; CHECK-LABEL: strict_convert_unsigned: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: fcvtzu w0, s0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: strict_convert_unsigned: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll index 42be60ad55970..0fc0d9cda4e63 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve,+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -228,10 +228,7 @@ entry: define half @strict_convert_signed(i32 %x) { ; CHECK-LABEL: strict_convert_signed: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: scvtf z0.h, p0/m, z0.s -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: scvtf h0, w0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: strict_convert_signed: @@ -247,10 +244,7 @@ entry: define float @strict_convert_unsigned(i64 %x) { ; CHECK-LABEL: strict_convert_unsigned: ; 
CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: ucvtf s0, x0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: strict_convert_unsigned: From 3a5683a9d37f3e8570a8b1481c907e65393be878 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 24 Oct 2024 11:14:40 +0000 Subject: [PATCH 5/9] Move to DAGCombine + fixups --- .../Target/AArch64/AArch64ISelLowering.cpp | 116 +++++++++--------- .../sve-streaming-mode-cvt-fp-int-fp.ll | 48 +++++++- .../sve-streaming-mode-cvt-int-to-fp.ll | 11 +- 3 files changed, 111 insertions(+), 64 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7e8153534076d..1aaf8a79218d4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4550,10 +4550,9 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, EVT VT = Op.getValueType(); if (VT.isScalableVector()) { - unsigned Opc = Op.getOpcode(); - bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT; - unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU - : AArch64ISD::FCVTZU_MERGE_PASSTHRU; + unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT + ? AArch64ISD::FCVTZU_MERGE_PASSTHRU + : AArch64ISD::FCVTZS_MERGE_PASSTHRU; return LowerToPredicatedOp(Op, DAG, Opcode); } @@ -4629,46 +4628,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, return Op; } -static bool CanLowerToScalarSVEFPIntConversion(EVT VT) { - if (!VT.isSimple()) - return false; - // There are SVE instructions that can convert to/from all pairs of these int - // and float types. Note: We don't bother with i8 or i16 as those are illegal - // types for scalars. - return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64}, - VT.getSimpleVT().SimpleTy); -} - -/// Lowers a scalar FP conversion (to/from) int to SVE. -static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) { - assert(!Op->isStrictFPOpcode() && "strict fp ops not supported"); - SDValue SrcVal = Op.getOperand(0); - EVT SrcTy = SrcVal.getValueType(); - EVT DestTy = Op.getValueType(); - EVT SrcVecTy; - EVT DestVecTy; - // Use a packed vector for the larger type. - // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that - // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as - // (unlike floats) nxv2i32 is an illegal unpacked type. - if (DestTy.bitsGT(SrcTy)) { - DestVecTy = getPackedSVEVectorVT(DestTy); - SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy) - : DestVecTy.changeVectorElementType(SrcTy); - } else { - SrcVecTy = getPackedSVEVectorVT(SrcTy); - DestVecTy = DestTy == MVT::i32 ? 
getPackedSVEVectorVT(DestTy) - : SrcVecTy.changeVectorElementType(DestTy); - } - SDLoc dl(Op); - SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); - SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy, - DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); - Vec = DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - ZeroIdx); -} - SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); @@ -4677,12 +4636,6 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, if (SrcVal.getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (!IsStrict && !Subtarget->isNeonAvailable() && - Subtarget->isSVEorStreamingSVEAvailable() && - CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) && - CanLowerToScalarSVEFPIntConversion(Op.getValueType())) - return LowerScalarFPConversionToSVE(Op, DAG); - // f16 conversions are promoted to f32 when full fp16 is not supported. if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVal.getValueType() == MVT::bf16) { @@ -4986,12 +4939,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); - if (!IsStrict && !Subtarget->isNeonAvailable() && - Subtarget->isSVEorStreamingSVEAvailable() && - CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) && - CanLowerToScalarSVEFPIntConversion(Op.getValueType())) - return LowerScalarFPConversionToSVE(Op, DAG); - bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP || Op->getOpcode() == ISD::SINT_TO_FP; @@ -19014,6 +18961,57 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } +static bool +shouldUseSVEForScalarFPConversion(SDNode *N, + const AArch64Subtarget *Subtarget) { + auto isSupportedType = [](EVT VT) { + if (!VT.isSimple()) + return false; + // There are SVE instructions that can convert to/from all pairs of these + // int and float types. Note: We don't bother with i8 or i16 as those are + // illegal types for scalars. + return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64}, + VT.getSimpleVT().SimpleTy); + }; + // If we are in a streaming[-compatible] function, use SVE for scalar FP <-> + // INT conversions as this can help avoid movs between GPRs and FPRs, which + // could be quite expensive. + return !N->isStrictFPOpcode() && Subtarget->isSVEorStreamingSVEAvailable() && + (Subtarget->isStreaming() || Subtarget->isStreamingCompatible()) && + isSupportedType(N->getValueType(0)) && + isSupportedType(N->getOperand(0).getValueType()); +} + +/// Replaces a scalar FP <-> INT conversion with an SVE (scalable) one, wrapped +/// with an insert and extract. +static SDValue replaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG) { + assert(!N->isStrictFPOpcode() && "strict fp ops not supported"); + SDValue SrcVal = N->getOperand(0); + EVT SrcTy = SrcVal.getValueType(); + EVT DestTy = N->getValueType(0); + EVT SrcVecTy; + EVT DestVecTy; + // Use a packed vector for the larger type. + // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that + // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as + // (unlike floats) nxv2i32 is an illegal unpacked type. + if (DestTy.bitsGT(SrcTy)) { + DestVecTy = getPackedSVEVectorVT(DestTy); + SrcVecTy = SrcTy == MVT::i32 ? 
getPackedSVEVectorVT(SrcTy) + : DestVecTy.changeVectorElementType(SrcTy); + } else { + SrcVecTy = getPackedSVEVectorVT(SrcTy); + DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy) + : SrcVecTy.changeVectorElementType(DestTy); + } + SDLoc dl(N); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); + SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy, + DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); + Vec = DAG.getNode(N->getOpcode(), dl, DestVecTy, Vec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DestTy, Vec, ZeroIdx); +} + static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from @@ -19021,6 +19019,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; + if (shouldUseSVEForScalarFPConversion(N, Subtarget)) + return replaceScalarFPConversionWithSVE(N, DAG); + EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); @@ -19059,6 +19060,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { + if (shouldUseSVEForScalarFPConversion(N, Subtarget)) + return replaceScalarFPConversionWithSVE(N, DAG); + if (!Subtarget->isNeonAvailable()) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll index 0f4cb2060f249..1050dc0210a67 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -force-streaming-compatible < %s | FileCheck %s -; RUN: llc -force-streaming-compatible -mattr=+sme2p2 < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS -; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS +; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS +; RUN: llc -mattr=+neon < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -20,6 +22,12 @@ define double @t1(double %x) { ; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0 ; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0 ; USE-NEON-NO-GPRS-NEXT: ret +; +; NONEON-NOSVE-LABEL: t1: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi double %x to i64 %conv1 = sitofp i64 %conv to double @@ -41,6 +49,12 @@ define float @t2(float %x) { ; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0 ; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0 ; USE-NEON-NO-GPRS-NEXT: ret +; +; NONEON-NOSVE-LABEL: t2: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi float %x to i32 %conv1 = sitofp i32 %conv to float @@ -64,6 +78,14 @@ define half @t3(half %x) { ; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0 ; USE-NEON-NO-GPRS-NEXT: fcvt h0, s0 ; USE-NEON-NO-GPRS-NEXT: ret +; +; NONEON-NOSVE-LABEL: t3: +; 
NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi half %x to i32 %conv1 = sitofp i32 %conv to half @@ -85,6 +107,12 @@ define double @t4(double %x) { ; USE-NEON-NO-GPRS-NEXT: fcvtzu d0, d0 ; USE-NEON-NO-GPRS-NEXT: ucvtf d0, d0 ; USE-NEON-NO-GPRS-NEXT: ret +; +; NONEON-NOSVE-LABEL: t4: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: ret entry: %conv = fptoui double %x to i64 %conv1 = uitofp i64 %conv to double @@ -106,6 +134,12 @@ define float @t5(float %x) { ; USE-NEON-NO-GPRS-NEXT: fcvtzu s0, s0 ; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0 ; USE-NEON-NO-GPRS-NEXT: ret +; +; NONEON-NOSVE-LABEL: t5: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ret entry: %conv = fptoui float %x to i32 %conv1 = uitofp i32 %conv to float @@ -129,6 +163,14 @@ define half @t6(half %x) { ; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0 ; USE-NEON-NO-GPRS-NEXT: fcvt h0, s0 ; USE-NEON-NO-GPRS-NEXT: ret +; +; NONEON-NOSVE-LABEL: t6: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret entry: %conv = fptoui half %x to i32 %conv1 = uitofp i32 %conv to half diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll index 0fc0d9cda4e63..61049478850c0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -; RUN: llc -mattr=+sve,+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,FORCE-STREAMING ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -226,10 +226,11 @@ entry: } define half @strict_convert_signed(i32 %x) { -; CHECK-LABEL: strict_convert_signed: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf h0, w0 -; CHECK-NEXT: ret +; FORCE-STREAMING-LABEL: strict_convert_signed: +; FORCE-STREAMING: // %bb.0: // %entry +; FORCE-STREAMING-NEXT: scvtf s0, w0 +; FORCE-STREAMING-NEXT: fcvt h0, s0 +; FORCE-STREAMING-NEXT: ret ; ; NONEON-NOSVE-LABEL: strict_convert_signed: ; NONEON-NOSVE: // %bb.0: // %entry From 8a37e17d6a0f86079c521f344d6cf392cdf8993c Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 24 Oct 2024 11:25:38 +0000 Subject: [PATCH 6/9] Tweak test --- .../sve-streaming-mode-cvt-int-to-fp.ll | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll index 61049478850c0..0a00ce69587b0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -; RUN: llc -mattr=+sme 
-force-streaming < %s | FileCheck %s --check-prefixes=CHECK,FORCE-STREAMING +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -225,21 +225,19 @@ entry: ret double %cvt } -define half @strict_convert_signed(i32 %x) { -; FORCE-STREAMING-LABEL: strict_convert_signed: -; FORCE-STREAMING: // %bb.0: // %entry -; FORCE-STREAMING-NEXT: scvtf s0, w0 -; FORCE-STREAMING-NEXT: fcvt h0, s0 -; FORCE-STREAMING-NEXT: ret +define float @strict_convert_signed(i32 %x) { +; CHECK-LABEL: strict_convert_signed: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: scvtf s0, w0 +; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: strict_convert_signed: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: scvtf s0, w0 -; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: ret entry: - %cvt = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 - ret half %cvt + %cvt = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %cvt } define float @strict_convert_unsigned(i64 %x) { From 64335db5b4c70dfa7639f0626a167d506486d2c2 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 24 Oct 2024 15:14:37 +0000 Subject: [PATCH 7/9] WIP --- .../Target/AArch64/AArch64ISelLowering.cpp | 120 ++++++++++-------- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 +- .../sve-streaming-mode-cvt-fp-to-int.ll | 10 +- .../sve-streaming-mode-cvt-int-to-fp.ll | 11 +- ...e-streaming-mode-fixed-length-fp-to-int.ll | 4 +- ...e-streaming-mode-fixed-length-int-to-fp.ll | 24 ++-- 6 files changed, 99 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1aaf8a79218d4..f29605f62ee6f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18961,9 +18961,39 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } -static bool -shouldUseSVEForScalarFPConversion(SDNode *N, - const AArch64Subtarget *Subtarget) { +/// Creates a scalar FP <-> INT conversion with a scalable one, wrapped +/// with an insert and extract. +static SDValue createScalarSVEFPConversion(SelectionDAG &DAG, unsigned Opc, + SDLoc DL, SDValue SrcVal, EVT SrcTy, + EVT DestTy) { + EVT SrcVecTy; + EVT DestVecTy; + if (DestTy.bitsGT(SrcTy)) { + DestVecTy = getPackedSVEVectorVT(DestTy); + SrcVecTy = DestVecTy.changeVectorElementType(SrcTy); + } else { + SrcVecTy = getPackedSVEVectorVT(SrcTy); + DestVecTy = SrcVecTy.changeVectorElementType(DestTy); + } + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); + SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy, + DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); + Vec = DAG.getNode(Opc, DL, DestVecTy, Vec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx); +} + +/// Tries to replace scalar FP <-> conversions with SVE in streaming functions. +static SDValue +tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + // Uncomment to introduce extra fcvts. 
+ // if (DCI.isBeforeLegalizeOps()) + // return SDValue(); + + if (N->isStrictFPOpcode()) + return SDValue(); + auto isSupportedType = [](EVT VT) { if (!VT.isSimple()) return false; @@ -18973,54 +19003,52 @@ shouldUseSVEForScalarFPConversion(SDNode *N, return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64}, VT.getSimpleVT().SimpleTy); }; + + if (!isSupportedType(N->getValueType(0)) || + !isSupportedType(N->getOperand(0).getValueType())) + return SDValue(); + // If we are in a streaming[-compatible] function, use SVE for scalar FP <-> - // INT conversions as this can help avoid movs between GPRs and FPRs, which + // INT conversions as this can help avoid moves between GPRs and FPRs, which // could be quite expensive. - return !N->isStrictFPOpcode() && Subtarget->isSVEorStreamingSVEAvailable() && - (Subtarget->isStreaming() || Subtarget->isStreamingCompatible()) && - isSupportedType(N->getValueType(0)) && - isSupportedType(N->getOperand(0).getValueType()); -} + if (!Subtarget->isSVEorStreamingSVEAvailable() || + (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible())) + return SDValue(); -/// Replaces a scalar FP <-> INT conversion with an SVE (scalable) one, wrapped -/// with an insert and extract. -static SDValue replaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG) { - assert(!N->isStrictFPOpcode() && "strict fp ops not supported"); + SDLoc DL(N); + unsigned Opc = N->getOpcode(); SDValue SrcVal = N->getOperand(0); EVT SrcTy = SrcVal.getValueType(); EVT DestTy = N->getValueType(0); - EVT SrcVecTy; - EVT DestVecTy; - // Use a packed vector for the larger type. - // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that - // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as - // (unlike floats) nxv2i32 is an illegal unpacked type. - if (DestTy.bitsGT(SrcTy)) { - DestVecTy = getPackedSVEVectorVT(DestTy); - SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy) - : DestVecTy.changeVectorElementType(SrcTy); - } else { - SrcVecTy = getPackedSVEVectorVT(SrcTy); - DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy) - : SrcVecTy.changeVectorElementType(DestTy); + + // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal + // type (unlike the equivalent nxv2f32 for floating-point types). + // May materialize extra instructions :( + if (SrcTy == MVT::i32 && DestTy == MVT::f64) { + SDValue ExtSrc = DAG.getNode(Opc == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + DL, MVT::i64, SrcVal); + return createScalarSVEFPConversion(DAG, Opc, DL, ExtSrc, MVT::i64, + MVT::f64); } - SDLoc dl(N); - SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); - SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy, - DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); - Vec = DAG.getNode(N->getOpcode(), dl, DestVecTy, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DestTy, Vec, ZeroIdx); + if (SrcTy == MVT::f64 && DestTy == MVT::i32) { + SDValue ExtDest = + createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, MVT::f64, MVT::i64); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, ExtDest); + } + return createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, SrcTy, DestTy); } static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. 
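  // For illustration (a sketch; exact types depend on the operands), the new
  // SVE combine tried below (tryReplaceScalarFPConversionWithSVE) rewrites a
  // scalar conversion such as (f32 (sint_to_fp (i32 X))) into an
  // insert -> predicated convert -> extract sequence:
  //   (extract_vector_elt
  //     (sint_to_fp (insert_vector_elt (undef nxv4i32), X, 0)), 0)
  // which then selects as a single SVE instruction (e.g. "scvtf z0.s, p0/m,
  // z0.s"), keeping the value out of the GPRs.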
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; - if (shouldUseSVEForScalarFPConversion(N, Subtarget)) - return replaceScalarFPConversionWithSVE(N, DAG); + if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget)) + return Res; EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) @@ -19060,8 +19088,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - if (shouldUseSVEForScalarFPConversion(N, Subtarget)) - return replaceScalarFPConversionWithSVE(N, DAG); + if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget)) + return Res; if (!Subtarget->isNeonAvailable()) return SDValue(); @@ -26082,7 +26110,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG, Subtarget); + return performIntToFpCombine(N, DAG, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FP_TO_SINT_SAT: @@ -28384,21 +28412,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, unsigned NewOp) const { EVT VT = Op.getValueType(); SDLoc DL(Op); - SDValue Pg; - - // FCVTZS_ZPmZ_DtoS and FCVTZU_ZPmZ_DtoS are special cases. These operations - // return nxv4i32 rather than the correct nxv2i32, as nxv2i32 is an illegal - // unpacked type. So, in this case, we take the predicate size from the - // operand. - SDValue LastOp{}; - if ((NewOp == AArch64ISD::FCVTZU_MERGE_PASSTHRU || - NewOp == AArch64ISD::FCVTZS_MERGE_PASSTHRU) && - VT == MVT::nxv4i32 && - (LastOp = Op->ops().back().get()).getValueType() == MVT::nxv2f64) { - Pg = getPredicateForVector(DAG, DL, LastOp.getValueType()); - } else { - Pg = getPredicateForVector(DAG, DL, VT); - } + auto Pg = getPredicateForVector(DAG, DL, VT); if (VT.isFixedLengthVector()) { assert(isTypeLegal(VT) && "Expected only legal fixed-width types"); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index dfff9c627540b..7824905244905 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2328,8 +2328,8 @@ let Predicates = [HasSVEorSME] in { defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>; defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>; defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; defm UCVTF_ZPmZ_StoH : 
sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; @@ -2338,8 +2338,8 @@ let Predicates = [HasSVEorSME] in { defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, AArch64fcvtzs_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, AArch64fcvtzu_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll index 300ccefc71c91..6ef5a0b985b59 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll @@ -84,8 +84,9 @@ define i32 @f64_to_s32(double %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: f64_to_s32: @@ -194,8 +195,9 @@ define i32 @f64_to_u32(double %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.d -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: f64_to_u32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll index 0a00ce69587b0..59b6a4a69e5d1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll @@ -45,9 +45,11 @@ entry: define double @s32_to_f64(i32 %x) { ; CHECK-LABEL: s32_to_f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: scvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -100,9 +102,10 @@ entry: define double @u32_to_f64(i32 %x) { ; CHECK-LABEL: u32_to_f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 5e162fbfef196..4add5d8a23ac9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -1166,7 +1166,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -2867,7 +2867,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 24ad0f502dbf3..18d4209bb76e4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -392,8 +392,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -2836,10 +2836,10 @@ define float @scvtf_i16_f32(ptr %0) { define double @scvtf_i16_f64(ptr %0) { ; CHECK-LABEL: scvtf_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: ldrsh x8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: scvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -2895,9 +2895,10 @@ define float @scvtf_i32_f32(ptr %0) { define double @scvtf_i32_f64(ptr %0) { ; CHECK-LABEL: scvtf_i32_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: ldrsw x8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: scvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -3015,8 +3016,8 @@ define double @ucvtf_i16_f64(ptr %0) 
{ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -3072,9 +3073,10 @@ define float @ucvtf_i32_f32(ptr %0) { define double @ucvtf_i32_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i32_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; From b81e8db670a3cfdbcb2da5c07849bd8330a4aee5 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 24 Oct 2024 16:54:27 +0000 Subject: [PATCH 8/9] Use intrinsics --- .../Target/AArch64/AArch64ISelLowering.cpp | 89 +++++++++---------- .../sve-streaming-mode-cvt-fp-to-int.ll | 10 +-- .../sve-streaming-mode-cvt-int-to-fp.ll | 11 +-- ...e-streaming-mode-fixed-length-fp-to-int.ll | 4 +- ...e-streaming-mode-fixed-length-int-to-fp.ll | 24 +++-- 5 files changed, 65 insertions(+), 73 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f29605f62ee6f..2b9420dd10003 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18961,36 +18961,10 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } -/// Creates a scalar FP <-> INT conversion with a scalable one, wrapped -/// with an insert and extract. -static SDValue createScalarSVEFPConversion(SelectionDAG &DAG, unsigned Opc, - SDLoc DL, SDValue SrcVal, EVT SrcTy, - EVT DestTy) { - EVT SrcVecTy; - EVT DestVecTy; - if (DestTy.bitsGT(SrcTy)) { - DestVecTy = getPackedSVEVectorVT(DestTy); - SrcVecTy = DestVecTy.changeVectorElementType(SrcTy); - } else { - SrcVecTy = getPackedSVEVectorVT(SrcTy); - DestVecTy = SrcVecTy.changeVectorElementType(DestTy); - } - SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); - SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy, - DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); - Vec = DAG.getNode(Opc, DL, DestVecTy, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx); -} - /// Tries to replace scalar FP <-> conversions with SVE in streaming functions. static SDValue tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - // Uncomment to introduce extra fcvts. - // if (DCI.isBeforeLegalizeOps()) - // return SDValue(); - if (N->isStrictFPOpcode()) return SDValue(); @@ -19015,39 +18989,64 @@ tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible())) return SDValue(); - SDLoc DL(N); unsigned Opc = N->getOpcode(); + bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::FP_TO_SINT; + SDValue SrcVal = N->getOperand(0); EVT SrcTy = SrcVal.getValueType(); EVT DestTy = N->getValueType(0); - // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal - // type (unlike the equivalent nxv2f32 for floating-point types). - // May materialize extra instructions :( - if (SrcTy == MVT::i32 && DestTy == MVT::f64) { - SDValue ExtSrc = DAG.getNode(Opc == ISD::SINT_TO_FP ? 
ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND, - DL, MVT::i64, SrcVal); - return createScalarSVEFPConversion(DAG, Opc, DL, ExtSrc, MVT::i64, - MVT::f64); + EVT SrcVecTy; + EVT DestVecTy; + if (DestTy.bitsGT(SrcTy)) { + DestVecTy = getPackedSVEVectorVT(DestTy); + SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy) + : DestVecTy.changeVectorElementType(SrcTy); + } else { + SrcVecTy = getPackedSVEVectorVT(SrcTy); + DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy) + : SrcVecTy.changeVectorElementType(DestTy); } - if (SrcTy == MVT::f64 && DestTy == MVT::i32) { - SDValue ExtDest = - createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, MVT::f64, MVT::i64); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, ExtDest); + + SDLoc DL(N); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); + SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy, + DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx); + + // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal + // type (unlike the equivalent nxv2f32 for floating-point types). So, + // unfortunately, the only way to lower to these variants is via the + // intrinsics. Note: We could sign/zero extend to the i64 variant, but that + // may result in extra extends or fmovs in the final assembly. + bool IsI32ToF64 = SrcTy == MVT::i32 && DestTy == MVT::f64; + bool isF64ToI32 = SrcTy == MVT::f64 && DestTy == MVT::i32; + if (IsI32ToF64 || isF64ToI32) { + unsigned IntrinsicOpc; + if (IsI32ToF64) + IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32 + : Intrinsic::aarch64_sve_ucvtf_f64i32; + else + IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64 + : Intrinsic::aarch64_sve_fcvtzu_i32f64; + SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64); + Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy, + {DAG.getConstant(IntrinsicOpc, DL, MVT::i32), + DAG.getUNDEF(DestTy), PTrue, Vec}); + } else { + Vec = DAG.getNode(Opc, DL, DestVecTy, Vec); } - return createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, SrcTy, DestTy); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx); } static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. 
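  // With the intrinsic-based i32 <-> f64 special case above, the 32-bit
  // element variants are used directly. For example, s32_to_f64 (see the test
  // updates below) now selects (a sketch):
  //   fmov s0, w0
  //   ptrue p0.d
  //   scvtf z0.d, p0/m, z0.s
  // rather than first sign-extending the i32 to i64 in a GPR (sxtw), as the
  // previous revision did.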
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; - if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget)) + if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, Subtarget)) return Res; EVT VT = N->getValueType(0); @@ -19088,7 +19087,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget)) + if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, Subtarget)) return Res; if (!Subtarget->isNeonAvailable()) @@ -26110,7 +26109,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG, DCI, Subtarget); + return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FP_TO_SINT_SAT: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll index 6ef5a0b985b59..300ccefc71c91 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll @@ -84,9 +84,8 @@ define i32 @f64_to_s32(double %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: f64_to_s32: @@ -195,9 +194,8 @@ define i32 @f64_to_u32(double %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.d +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: f64_to_u32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll index 59b6a4a69e5d1..0a00ce69587b0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll @@ -45,11 +45,9 @@ entry: define double @s32_to_f64(i32 %x) { ; CHECK-LABEL: s32_to_f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -102,10 +100,9 @@ entry: define double @u32_to_f64(i32 %x) { ; CHECK-LABEL: u32_to_f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 4add5d8a23ac9..5e162fbfef196 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -1166,7 +1166,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -2867,7 +2867,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.d ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 18d4209bb76e4..24ad0f502dbf3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -392,8 +392,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -2836,10 +2836,10 @@ define float @scvtf_i16_f32(ptr %0) { define double @scvtf_i16_f64(ptr %0) { ; CHECK-LABEL: scvtf_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh x8, [x0] +; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -2895,10 +2895,9 @@ define float @scvtf_i32_f32(ptr %0) { define double @scvtf_i32_f64(ptr %0) { ; CHECK-LABEL: scvtf_i32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsw x8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: scvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -3016,8 +3015,8 @@ define double @ucvtf_i16_f64(ptr %0) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -3073,10 +3072,9 @@ define float @ucvtf_i32_f32(ptr %0) { define double @ucvtf_i32_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; From 7999e0b0e693f85be771d09b74e349a8389e192c Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 25 Oct 2024 16:41:10 +0000 Subject: [PATCH 9/9] Minor tweaks --- .../Target/AArch64/AArch64ISelLowering.cpp | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b9420dd10003..56d901e6e3987 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18961,13 +18961,18 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
-/// Tries to replace scalar FP <-> conversions with SVE in streaming functions.
+/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
+/// functions; this can help reduce the number of fmovs to/from GPRs.
 static SDValue
-tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
-                                    const AArch64Subtarget *Subtarget) {
+tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
+                                      const AArch64Subtarget *Subtarget) {
   if (N->isStrictFPOpcode())
     return SDValue();
 
+  if (!Subtarget->isSVEorStreamingSVEAvailable() ||
+      (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
+    return SDValue();
+
   auto isSupportedType = [](EVT VT) {
     if (!VT.isSimple())
       return false;
@@ -18982,13 +18987,6 @@ tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
       !isSupportedType(N->getOperand(0).getValueType()))
     return SDValue();
 
-  // If we are in a streaming[-compatible] function, use SVE for scalar FP <->
-  // INT conversions as this can help avoid moves between GPRs and FPRs, which
-  // could be quite expensive.
-  if (!Subtarget->isSVEorStreamingSVEAvailable() ||
-      (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
-    return SDValue();
-
   unsigned Opc = N->getOpcode();
   bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::FP_TO_SINT;
 
@@ -19014,10 +19012,10 @@ tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
                             DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
 
   // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
-  // type (unlike the equivalent nxv2f32 for floating-point types). So,
-  // unfortunately, the only way to lower to these variants is via the
-  // intrinsics. Note: We could sign/zero extend to the i64 variant, but that
-  // may result in extra extends or fmovs in the final assembly.
+  // type (unlike the equivalent nxv2f32 for floating-point types). So the only
+  // way to lower to these variants is via the intrinsics. Note: We could
+  // sign/zero extend to the i64 variant, but that may result in extra extends
+  // or fmovs in the final assembly.
   bool IsI32ToF64 = SrcTy == MVT::i32 && DestTy == MVT::f64;
   bool isF64ToI32 = SrcTy == MVT::f64 && DestTy == MVT::i32;
   if (IsI32ToF64 || isF64ToI32) {
@@ -19046,7 +19044,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
-  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
+  if (SDValue Res = tryToReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
     return Res;
 
   EVT VT = N->getValueType(0);
@@ -19087,7 +19085,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
-  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
+  if (SDValue Res = tryToReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
     return Res;
 
   if (!Subtarget->isNeonAvailable())
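
For reference, the end-to-end effect of this series on a scalar fp -> int -> fp
round trip can be checked with a small IR reproducer; this is a sketch (the
function name is hypothetical, mirroring the t1 case in
sve-streaming-mode-cvt-fp-int-fp.ll):

define double @roundtrip(double %x) {
entry:
  %conv = fptosi double %x to i64
  %conv1 = sitofp i64 %conv to double
  ret double %conv1
}

Built with "llc -mattr=+sve -force-streaming-compatible", this should select a
predicated SVE fcvtzs/scvtf pair on z0.d with no GPR round trip, while the
NONEON-NOSVE configuration (streaming-compatible, no SVE) keeps the baseline
"fcvtzs x8, d0" / "scvtf d0, x8" sequence through a GPR.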