Skip to content

[AArch64] Avoid GPR trip when moving truncated i32 vector elements #114541

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 20, 2024
40 changes: 35 additions & 5 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20945,17 +20945,47 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}

static SDValue performTruncateCombine(SDNode *N,
SelectionDAG &DAG) {
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
N0.getOpcode() == AArch64ISD::DUP) {
SDValue Op = N0.getOperand(0);
if (VT.getScalarType() == MVT::i32 &&
N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
return DAG.getNode(N0.getOpcode(), DL, VT, Op);
}

// Performing the following combine produces a preferable form for ISEL.
// i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.hasOneUse()) {
SDValue Op = N0.getOperand(0);
SDValue ExtractIndexNode = N0.getOperand(1);
if (!isa<ConstantSDNode>(ExtractIndexNode))
return SDValue();

// For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
// So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
"Unexpected legalisation result!");

EVT SrcVectorType = Op.getValueType();
// We also assume that SrcVectorType cannot be a V64 (see
// LowerEXTRACT_VECTOR_ELT).
assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
"Unexpected legalisation result!");

unsigned ExtractIndex =
cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;

Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
}

return SDValue();
Expand Down Expand Up @@ -26258,7 +26288,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DAG);
return performTruncateCombine(N, DAG, DCI);
case AArch64ISD::ANDS:
return performFlagSettingCombine(N, DCI, ISD::AND);
case AArch64ISD::ADC:
Expand Down
58 changes: 20 additions & 38 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6977,6 +6977,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;

// Also covers DUP (truncate i64 to i32)
def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
(DUPv2i32lane V128:$Rn, imm:$idx)>;
def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
(DUPv4i32lane V128:$Rn, imm:$idx)>;

// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
Expand All @@ -6990,44 +6996,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;

multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
ValueType Src128VT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
imm:$idx)))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;

defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
imm:$idx))))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
imm:$idx))))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;

defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM>
: Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

// DUP (truncate i16 to i8)
def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
// DUP (truncate i32/i64 to i8)
def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
// DUP (truncate i32/i64 to i16)
def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret

entry:
Expand Down
136 changes: 136 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s

; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
; truncated size to avoid pointless GPR trips.


define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
; CHECK-LABEL: test_s_trunc_d_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <1 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_d_qlane1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[0], v1.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_d_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%c = extractelement <1 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 0
ret <4 x i32> %e
}

define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_d_qlane1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[3], v1.s[2]
; CHECK-NEXT: ret
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}

; ---- From the bottom 128b of an SVE vector

define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[1], v1.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 1
ret <2 x i32> %e
}

; (negative test) Extracted element is not within V-register.
define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 1
ret <2 x i32> %e
}

define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 0
ret <4 x i32> %e
}

define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[3], v1.s[2]
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}

; (negative test) Extracted element is not within V-register.
define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AArch64/sve-doublereduct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
Expand All @@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z1.h, z3.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
Expand All @@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z2.h, z5.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/sve-extract-element.ll
Original file line number Diff line number Diff line change
Expand Up @@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
; CHECK-LABEL: test_lane4_2xi1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: mov z0.d, z0.d[4]
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: mov z0.s, z0.s[8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%b = extractelement <vscale x 2 x i1> %a, i32 4
Expand Down
7 changes: 2 additions & 5 deletions llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
; CHECK-LABEL: extract_v2i1_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: mov v0.s[1], v0.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
ret <2 x i1> %mask
Expand Down
Loading
Loading