Skip to content

[AArch64] Avoid GPR trip when moving truncated i32 vector elements #114541

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 20, 2024
40 changes: 35 additions & 5 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20945,17 +20945,47 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}

static SDValue performTruncateCombine(SDNode *N,
SelectionDAG &DAG) {
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
N0.getOpcode() == AArch64ISD::DUP) {
SDValue Op = N0.getOperand(0);
if (VT.getScalarType() == MVT::i32 &&
N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
return DAG.getNode(N0.getOpcode(), DL, VT, Op);
}

// Performing the following combine produces a preferable form for ISEL.
// i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.hasOneUse()) {
SDValue Op = N0.getOperand(0);
SDValue ExtractIndexNode = N0.getOperand(1);
if (!isa<ConstantSDNode>(ExtractIndexNode))
return SDValue();

// For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
// So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
"Unexpected legalisation result!");

EVT SrcVectorType = Op.getValueType();
// We also assume that SrcVectorType cannot be a V64 (see
// LowerEXTRACT_VECTOR_ELT).
assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
"Unexpected legalisation result!");

unsigned ExtractIndex =
cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;

Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
}

return SDValue();
Expand Down Expand Up @@ -26258,7 +26288,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DAG);
return performTruncateCombine(N, DAG, DCI);
case AArch64ISD::ANDS:
return performFlagSettingCombine(N, DCI, ISD::AND);
case AArch64ISD::ADC:
Expand Down
58 changes: 20 additions & 38 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6977,6 +6977,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;

// Also covers DUP (truncate i64 to i32)
def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
(DUPv2i32lane V128:$Rn, imm:$idx)>;
def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
(DUPv4i32lane V128:$Rn, imm:$idx)>;

// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
Expand All @@ -6990,44 +6996,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;

multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
ValueType Src128VT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
imm:$idx)))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;

defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
imm:$idx))))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
imm:$idx))))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;

defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM>
: Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

// DUP (truncate i16 to i8)
def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
// DUP (truncate i32/i64 to i8)
def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
// DUP (truncate i32/i64 to i16)
def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret

entry:
Expand Down
136 changes: 136 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s

; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
; truncated size to avoid pointless GPR trips.


define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
; CHECK-LABEL: test_s_trunc_d_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <1 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_d_qlane1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[0], v1.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_d_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%c = extractelement <1 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 0
ret <4 x i32> %e
}

define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_d_qlane1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[3], v1.s[2]
; CHECK-NEXT: ret
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}

; ---- From the bottom 128b of an SVE vector

define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[1], v1.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 1
ret <2 x i32> %e
}

; (negative test) Extracted element is not within V-register.
define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 1
ret <2 x i32> %e
}

define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 0
ret <4 x i32> %e
}

define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[3], v1.s[2]
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}

; (negative test) Extracted element is not within V-register.
define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AArch64/sve-doublereduct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
Expand All @@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z1.h, z3.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
Expand All @@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z2.h, z5.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/sve-extract-element.ll
Original file line number Diff line number Diff line change
Expand Up @@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
; CHECK-LABEL: test_lane4_2xi1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: mov z0.d, z0.d[4]
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: mov z0.s, z0.s[8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%b = extractelement <vscale x 2 x i1> %a, i32 4
Expand Down
7 changes: 2 additions & 5 deletions llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
; CHECK-LABEL: extract_v2i1_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: mov v0.s[1], v0.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
ret <2 x i1> %mask
Expand Down
Loading
Loading