From 8afbbe98272d7aaddd9436a90e9f45d2a14e7e46 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 23 Oct 2024 14:23:56 +0000 Subject: [PATCH 1/3] Initial working version using umax + or reduction + select --- llvm/docs/LangRef.rst | 30 + llvm/include/llvm/IR/Intrinsics.td | 6 + .../SelectionDAG/SelectionDAGBuilder.cpp | 23 + .../CodeGen/AArch64/vector-masked-extract.ll | 663 ++++++++++++++++++ 4 files changed, 722 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/vector-masked-extract.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 6fb66ce231e8a..11c848039b849 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20004,6 +20004,36 @@ the follow sequence of operations: The ``mask`` operand will apply to at least the gather and scatter operations. +'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This is an overloaded intrinsic. + +This intrinsic will extract the value from a single lane of a vector, based +on a supplied mask vector. + +:: + + declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) + declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) Arguments: """""""""" The first argument is the data vector to extract a lane from. The second is a mask vector controlling the extraction. The third argument is a passthru value. The two input vectors must have the same number of elements, and the type of the passthru value must match that of the elements of the data vector. Semantics: """""""""" The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will find the index of the most significant active lane in the mask vector, and extract the element at that index in the corresponding data vector. If no mask lanes are active then the passthru value is returned instead.
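For illustration (this sketch is not part of the patch), the umax + or-reduction + select strategy named in the subject corresponds to roughly the following straight-line IR for the fixed-width ``v4i32`` case; the function name is invented:

::

    define i32 @extract_last_active_sketch(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
      ; Keep each lane's index where the mask is set, zero elsewhere.
      %idxs = select <4 x i1> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
      ; The largest surviving index is the most significant active lane.
      %last = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %idxs)
      %elt = extractelement <4 x i32> %data, i32 %last
      ; If no lane is active, fall back to the passthru value.
      %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
      %res = select i1 %any, i32 %elt, i32 %passthru
      ret i32 %res
    }

..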
_int_vector_compress: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 57c9bf50fb5a1..3a24f97f4cfa2 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1928,6 +1928,12 @@ def int_experimental_vector_match : DefaultAttrsIntrinsic< LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], // Mask [ IntrNoMem, IntrNoSync, IntrWillReturn ]>; +// Extract based on mask bits +def int_experimental_vector_masked_extract_last_active: + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMVectorElementType<0>], [IntrNoMem]>; + // Operators let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { // Integer arithmetic diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3c10ebbd22bb0..b63ed6fc12d52 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8236,6 +8236,29 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitVectorHistogram(I, Intrinsic); return; } + case Intrinsic::experimental_vector_masked_extract_last_active: { + SDValue Data = getValue(I.getOperand(0)); + SDValue Mask = getValue(I.getOperand(1)); + SDValue PassThru = getValue(I.getOperand(2)); + + EVT DataVT = Data.getValueType(); + EVT ScalarVT = PassThru.getValueType(); + EVT BoolVT = Mask.getValueType().getScalarType(); + EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT); + + SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT); + SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT); + SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes); + SDValue HighestIdx = + DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts); + SDValue Extract = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx); + SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask); + SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru); + setValue(&I, Result); + return; + } } } diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll new file mode 100644 index 0000000000000..04adf4e476b04 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll @@ -0,0 +1,663 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED +; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED + +define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) { +; NEON-FIXED-LABEL: extract_last_i8: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: umov w15, v1.b[14] +; NEON-FIXED-NEXT: umov w14, v1.b[6] +; NEON-FIXED-NEXT: adrp x8, .LCPI0_0 +; NEON-FIXED-NEXT: umov w12, v1.b[15] +; NEON-FIXED-NEXT: umov w13, v1.b[10] +; NEON-FIXED-NEXT: ldr q2, [x8, :lo12:.LCPI0_0] +; NEON-FIXED-NEXT: umov w11, v1.b[2] +; NEON-FIXED-NEXT: umov w8, v1.b[7] +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: umov w9, v1.b[11] +; NEON-FIXED-NEXT: umov w10, v1.b[3] +; NEON-FIXED-NEXT: umov w16, v1.b[12] +; NEON-FIXED-NEXT: fmov s3, w15 +; NEON-FIXED-NEXT: umov w15, v1.b[4] +; NEON-FIXED-NEXT: 
fmov s4, w14 +; NEON-FIXED-NEXT: fmov s5, w13 +; NEON-FIXED-NEXT: umov w13, v1.b[0] +; NEON-FIXED-NEXT: umov w14, v1.b[13] +; NEON-FIXED-NEXT: fmov s6, w11 +; NEON-FIXED-NEXT: umov w11, v1.b[5] +; NEON-FIXED-NEXT: mov v3.s[1], w12 +; NEON-FIXED-NEXT: umov w12, v1.b[8] +; NEON-FIXED-NEXT: mov v4.s[1], w8 +; NEON-FIXED-NEXT: umov w8, v1.b[9] +; NEON-FIXED-NEXT: mov v5.s[1], w9 +; NEON-FIXED-NEXT: umov w9, v1.b[1] +; NEON-FIXED-NEXT: fmov s7, w16 +; NEON-FIXED-NEXT: fmov s16, w15 +; NEON-FIXED-NEXT: mov v6.s[1], w10 +; NEON-FIXED-NEXT: fmov s18, w13 +; NEON-FIXED-NEXT: shl v1.16b, v1.16b, #7 +; NEON-FIXED-NEXT: fmov s17, w12 +; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0 +; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0 +; NEON-FIXED-NEXT: mov v7.s[1], w14 +; NEON-FIXED-NEXT: mov v16.s[1], w11 +; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0 +; NEON-FIXED-NEXT: mov v18.s[1], w9 +; NEON-FIXED-NEXT: adrp x9, .LCPI0_2 +; NEON-FIXED-NEXT: ushll v6.2d, v6.2s, #0 +; NEON-FIXED-NEXT: ldr q20, [x9, :lo12:.LCPI0_2] +; NEON-FIXED-NEXT: adrp x9, .LCPI0_7 +; NEON-FIXED-NEXT: mov v17.s[1], w8 +; NEON-FIXED-NEXT: adrp x8, .LCPI0_1 +; NEON-FIXED-NEXT: ldr q23, [x9, :lo12:.LCPI0_7] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_1] +; NEON-FIXED-NEXT: adrp x8, .LCPI0_3 +; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63 +; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63 +; NEON-FIXED-NEXT: ushll v7.2d, v7.2s, #0 +; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63 +; NEON-FIXED-NEXT: ushll v16.2d, v16.2s, #0 +; NEON-FIXED-NEXT: ushll v17.2d, v17.2s, #0 +; NEON-FIXED-NEXT: shl v6.2d, v6.2d, #63 +; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; NEON-FIXED-NEXT: ushll v18.2d, v18.2s, #0 +; NEON-FIXED-NEXT: cmlt v1.16b, v1.16b, #0 +; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0 +; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0 +; NEON-FIXED-NEXT: cmlt v6.2d, v6.2d, #0 +; NEON-FIXED-NEXT: and v2.16b, v3.16b, v2.16b +; NEON-FIXED-NEXT: shl v3.2d, v7.2d, #63 +; NEON-FIXED-NEXT: shl v7.2d, v16.2d, #63 +; NEON-FIXED-NEXT: shl v16.2d, v17.2d, #63 +; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI0_3] +; NEON-FIXED-NEXT: adrp x8, .LCPI0_4 +; NEON-FIXED-NEXT: ldr q21, [x8, :lo12:.LCPI0_4] +; NEON-FIXED-NEXT: adrp x8, .LCPI0_5 +; NEON-FIXED-NEXT: shl v18.2d, v18.2d, #63 +; NEON-FIXED-NEXT: ldr q22, [x8, :lo12:.LCPI0_5] +; NEON-FIXED-NEXT: adrp x8, .LCPI0_6 +; NEON-FIXED-NEXT: and v4.16b, v4.16b, v19.16b +; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_6] +; NEON-FIXED-NEXT: cmlt v16.2d, v16.2d, #0 +; NEON-FIXED-NEXT: and v5.16b, v5.16b, v20.16b +; NEON-FIXED-NEXT: cmlt v18.2d, v18.2d, #0 +; NEON-FIXED-NEXT: and v6.16b, v6.16b, v17.16b +; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; NEON-FIXED-NEXT: cmlt v7.2d, v7.2d, #0 +; NEON-FIXED-NEXT: umaxv b1, v1.16b +; NEON-FIXED-NEXT: and v16.16b, v16.16b, v19.16b +; NEON-FIXED-NEXT: and v17.16b, v18.16b, v23.16b +; NEON-FIXED-NEXT: cmhi v18.2d, v4.2d, v2.2d +; NEON-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d +; NEON-FIXED-NEXT: and v3.16b, v3.16b, v21.16b +; NEON-FIXED-NEXT: and v7.16b, v7.16b, v22.16b +; NEON-FIXED-NEXT: cmhi v21.2d, v17.2d, v16.2d +; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v18.16b +; NEON-FIXED-NEXT: mov v4.16b, v19.16b +; NEON-FIXED-NEXT: cmhi v20.2d, v7.2d, v3.2d +; NEON-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b +; NEON-FIXED-NEXT: mov v5.16b, v21.16b +; NEON-FIXED-NEXT: bit v3.16b, v7.16b, v20.16b +; NEON-FIXED-NEXT: bsl v5.16b, v17.16b, v16.16b +; NEON-FIXED-NEXT: cmhi v6.2d, v4.2d, v2.2d +; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v3.2d +; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v6.16b +; NEON-FIXED-NEXT: bit v3.16b, 
v5.16b, v7.16b +; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d +; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b +; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; NEON-FIXED-NEXT: cmhi d4, d2, d3 +; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b +; NEON-FIXED-NEXT: fmov x8, d2 +; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4 +; NEON-FIXED-NEXT: ldrb w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i8: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: umov w8, v1.b[14] +; SVE-FIXED-NEXT: umov w9, v1.b[6] +; SVE-FIXED-NEXT: index z2.d, #0, #1 +; SVE-FIXED-NEXT: umov w12, v1.b[2] +; SVE-FIXED-NEXT: umov w10, v1.b[10] +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: umov w13, v1.b[12] +; SVE-FIXED-NEXT: umov w11, v1.b[15] +; SVE-FIXED-NEXT: umov w14, v1.b[4] +; SVE-FIXED-NEXT: umov w16, v1.b[0] +; SVE-FIXED-NEXT: umov w15, v1.b[8] +; SVE-FIXED-NEXT: fmov s3, w8 +; SVE-FIXED-NEXT: umov w8, v1.b[7] +; SVE-FIXED-NEXT: fmov s4, w9 +; SVE-FIXED-NEXT: umov w9, v1.b[11] +; SVE-FIXED-NEXT: fmov s6, w12 +; SVE-FIXED-NEXT: umov w12, v1.b[3] +; SVE-FIXED-NEXT: fmov s5, w10 +; SVE-FIXED-NEXT: umov w10, v1.b[1] +; SVE-FIXED-NEXT: fmov s7, w13 +; SVE-FIXED-NEXT: umov w13, v1.b[13] +; SVE-FIXED-NEXT: fmov s16, w14 +; SVE-FIXED-NEXT: fmov s18, w16 +; SVE-FIXED-NEXT: mov v4.s[1], w8 +; SVE-FIXED-NEXT: umov w8, v1.b[5] +; SVE-FIXED-NEXT: mov v3.s[1], w11 +; SVE-FIXED-NEXT: mov v5.s[1], w9 +; SVE-FIXED-NEXT: mov v6.s[1], w12 +; SVE-FIXED-NEXT: umov w9, v1.b[9] +; SVE-FIXED-NEXT: fmov s17, w15 +; SVE-FIXED-NEXT: mov v18.s[1], w10 +; SVE-FIXED-NEXT: mov z19.d, z2.d +; SVE-FIXED-NEXT: mov v7.s[1], w13 +; SVE-FIXED-NEXT: mov z20.d, z2.d +; SVE-FIXED-NEXT: mov z21.d, z2.d +; SVE-FIXED-NEXT: mov v16.s[1], w8 +; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0 +; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0 +; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0 +; SVE-FIXED-NEXT: ushll v6.2d, v6.2s, #0 +; SVE-FIXED-NEXT: mov v17.s[1], w9 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: ushll v18.2d, v18.2s, #0 +; SVE-FIXED-NEXT: mov z25.d, z2.d +; SVE-FIXED-NEXT: ushll v7.2d, v7.2s, #0 +; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63 +; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63 +; SVE-FIXED-NEXT: ushll v16.2d, v16.2s, #0 +; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63 +; SVE-FIXED-NEXT: shl v6.2d, v6.2d, #63 +; SVE-FIXED-NEXT: mov z22.d, z2.d +; SVE-FIXED-NEXT: mov z23.d, z2.d +; SVE-FIXED-NEXT: add z19.d, z19.d, #6 // =0x6 +; SVE-FIXED-NEXT: shl v18.2d, v18.2d, #63 +; SVE-FIXED-NEXT: ushll v17.2d, v17.2s, #0 +; SVE-FIXED-NEXT: shl v7.2d, v7.2d, #63 +; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0 +; SVE-FIXED-NEXT: add z25.d, z25.d, #14 // =0xe +; SVE-FIXED-NEXT: shl v16.2d, v16.2d, #63 +; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0 +; SVE-FIXED-NEXT: add z20.d, z20.d, #10 // =0xa +; SVE-FIXED-NEXT: cmlt v6.2d, v6.2d, #0 +; SVE-FIXED-NEXT: add z21.d, z21.d, #2 // =0x2 +; SVE-FIXED-NEXT: mov z24.d, z2.d +; SVE-FIXED-NEXT: shl v17.2d, v17.2d, #63 +; SVE-FIXED-NEXT: cmlt v18.2d, v18.2d, #0 +; SVE-FIXED-NEXT: cmlt v7.2d, v7.2d, #0 +; SVE-FIXED-NEXT: add z22.d, z22.d, #12 // =0xc +; SVE-FIXED-NEXT: cmlt v16.2d, v16.2d, #0 +; SVE-FIXED-NEXT: add z23.d, z23.d, #4 // =0x4 +; SVE-FIXED-NEXT: and v3.16b, v3.16b, v25.16b +; SVE-FIXED-NEXT: and v4.16b, v4.16b, v19.16b +; SVE-FIXED-NEXT: and v5.16b, v5.16b, v20.16b +; SVE-FIXED-NEXT: and 
v6.16b, v6.16b, v21.16b +; SVE-FIXED-NEXT: cmlt v17.2d, v17.2d, #0 +; SVE-FIXED-NEXT: add z24.d, z24.d, #8 // =0x8 +; SVE-FIXED-NEXT: and v2.16b, v18.16b, v2.16b +; SVE-FIXED-NEXT: and v7.16b, v7.16b, v22.16b +; SVE-FIXED-NEXT: and v16.16b, v16.16b, v23.16b +; SVE-FIXED-NEXT: cmhi v18.2d, v4.2d, v3.2d +; SVE-FIXED-NEXT: shl v1.16b, v1.16b, #7 +; SVE-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d +; SVE-FIXED-NEXT: and v17.16b, v17.16b, v24.16b +; SVE-FIXED-NEXT: cmhi v20.2d, v16.2d, v7.2d +; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v18.16b +; SVE-FIXED-NEXT: cmlt v1.16b, v1.16b, #0 +; SVE-FIXED-NEXT: mov v4.16b, v19.16b +; SVE-FIXED-NEXT: cmhi v21.2d, v2.2d, v17.2d +; SVE-FIXED-NEXT: umaxv b1, v1.16b +; SVE-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b +; SVE-FIXED-NEXT: mov v5.16b, v20.16b +; SVE-FIXED-NEXT: bif v2.16b, v17.16b, v21.16b +; SVE-FIXED-NEXT: bsl v5.16b, v16.16b, v7.16b +; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d +; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d +; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b +; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b +; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d +; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b +; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; SVE-FIXED-NEXT: cmhi d4, d2, d3 +; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b +; SVE-FIXED-NEXT: fmov x8, d2 +; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4 +; SVE-FIXED-NEXT: ldrb w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) + ret i8 %res +} + +define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) { +; NEON-FIXED-LABEL: extract_last_i16: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1 +; NEON-FIXED-NEXT: umov w8, v1.b[6] +; NEON-FIXED-NEXT: umov w9, v1.b[2] +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: umov w11, v1.b[4] +; NEON-FIXED-NEXT: umov w12, v1.b[0] +; NEON-FIXED-NEXT: umov w10, v1.b[7] +; NEON-FIXED-NEXT: umov w13, v1.b[3] +; NEON-FIXED-NEXT: umov w14, v1.b[5] +; NEON-FIXED-NEXT: umov w15, v1.b[1] +; NEON-FIXED-NEXT: shl v1.8b, v1.8b, #7 +; NEON-FIXED-NEXT: fmov s2, w8 +; NEON-FIXED-NEXT: adrp x8, .LCPI1_0 +; NEON-FIXED-NEXT: fmov s3, w9 +; NEON-FIXED-NEXT: fmov s4, w11 +; NEON-FIXED-NEXT: adrp x9, .LCPI1_1 +; NEON-FIXED-NEXT: ldr q6, [x8, :lo12:.LCPI1_0] +; NEON-FIXED-NEXT: fmov s5, w12 +; NEON-FIXED-NEXT: adrp x8, .LCPI1_3 +; NEON-FIXED-NEXT: ldr q7, [x9, :lo12:.LCPI1_1] +; NEON-FIXED-NEXT: mov v2.s[1], w10 +; NEON-FIXED-NEXT: mov v3.s[1], w13 +; NEON-FIXED-NEXT: adrp x10, .LCPI1_2 +; NEON-FIXED-NEXT: mov v4.s[1], w14 +; NEON-FIXED-NEXT: ldr q16, [x10, :lo12:.LCPI1_2] +; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI1_3] +; NEON-FIXED-NEXT: mov v5.s[1], w15 +; NEON-FIXED-NEXT: cmlt v1.8b, v1.8b, #0 +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0 +; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0 +; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0 +; NEON-FIXED-NEXT: umaxv b1, v1.8b +; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0 +; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63 +; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63 +; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63 +; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63 +; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0 +; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0 +; 
NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0 +; NEON-FIXED-NEXT: and v2.16b, v2.16b, v6.16b +; NEON-FIXED-NEXT: and v3.16b, v3.16b, v7.16b +; NEON-FIXED-NEXT: and v4.16b, v4.16b, v16.16b +; NEON-FIXED-NEXT: and v5.16b, v5.16b, v17.16b +; NEON-FIXED-NEXT: cmhi v6.2d, v3.2d, v2.2d +; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v4.2d +; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v6.16b +; NEON-FIXED-NEXT: mov v3.16b, v7.16b +; NEON-FIXED-NEXT: bsl v3.16b, v5.16b, v4.16b +; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d +; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b +; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; NEON-FIXED-NEXT: cmhi d4, d2, d3 +; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b +; NEON-FIXED-NEXT: fmov x8, d2 +; NEON-FIXED-NEXT: bfi x9, x8, #1, #3 +; NEON-FIXED-NEXT: ldrh w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i16: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1 +; SVE-FIXED-NEXT: umov w8, v1.b[0] +; SVE-FIXED-NEXT: umov w10, v1.b[6] +; SVE-FIXED-NEXT: index z6.d, #0, #1 +; SVE-FIXED-NEXT: umov w11, v1.b[2] +; SVE-FIXED-NEXT: umov w14, v1.b[4] +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: umov w9, v1.b[1] +; SVE-FIXED-NEXT: umov w12, v1.b[7] +; SVE-FIXED-NEXT: umov w13, v1.b[3] +; SVE-FIXED-NEXT: fmov s2, w8 +; SVE-FIXED-NEXT: umov w8, v1.b[5] +; SVE-FIXED-NEXT: fmov s3, w10 +; SVE-FIXED-NEXT: fmov s4, w11 +; SVE-FIXED-NEXT: fmov s5, w14 +; SVE-FIXED-NEXT: mov z7.d, z6.d +; SVE-FIXED-NEXT: mov z16.d, z6.d +; SVE-FIXED-NEXT: mov z17.d, z6.d +; SVE-FIXED-NEXT: shl v1.8b, v1.8b, #7 +; SVE-FIXED-NEXT: mov v2.s[1], w9 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: mov v3.s[1], w12 +; SVE-FIXED-NEXT: mov v4.s[1], w13 +; SVE-FIXED-NEXT: mov v5.s[1], w8 +; SVE-FIXED-NEXT: add z7.d, z7.d, #2 // =0x2 +; SVE-FIXED-NEXT: add z17.d, z17.d, #6 // =0x6 +; SVE-FIXED-NEXT: add z16.d, z16.d, #4 // =0x4 +; SVE-FIXED-NEXT: cmlt v1.8b, v1.8b, #0 +; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0 +; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0 +; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0 +; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0 +; SVE-FIXED-NEXT: umaxv b1, v1.8b +; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63 +; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63 +; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63 +; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63 +; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0 +; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0 +; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0 +; SVE-FIXED-NEXT: and v2.16b, v2.16b, v6.16b +; SVE-FIXED-NEXT: and v3.16b, v3.16b, v17.16b +; SVE-FIXED-NEXT: and v4.16b, v4.16b, v7.16b +; SVE-FIXED-NEXT: and v5.16b, v5.16b, v16.16b +; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d +; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d +; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b +; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b +; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d +; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b +; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; SVE-FIXED-NEXT: cmhi d4, d2, d3 +; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b +; SVE-FIXED-NEXT: fmov x8, d2 +; SVE-FIXED-NEXT: bfi x9, x8, #1, #3 +; SVE-FIXED-NEXT: ldrh w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %res = call i16 
@llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) + ret i16 %res +} + +define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) { +; NEON-FIXED-LABEL: extract_last_i32: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: ushll v2.4s, v1.4h, #0 +; NEON-FIXED-NEXT: adrp x8, .LCPI2_0 +; NEON-FIXED-NEXT: adrp x9, .LCPI2_1 +; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI2_0] +; NEON-FIXED-NEXT: ldr q5, [x9, :lo12:.LCPI2_1] +; NEON-FIXED-NEXT: shl v1.4h, v1.4h, #15 +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0 +; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0 +; NEON-FIXED-NEXT: cmlt v1.4h, v1.4h, #0 +; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63 +; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63 +; NEON-FIXED-NEXT: umaxv h1, v1.4h +; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0 +; NEON-FIXED-NEXT: and v3.16b, v3.16b, v4.16b +; NEON-FIXED-NEXT: and v2.16b, v2.16b, v5.16b +; NEON-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d +; NEON-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b +; NEON-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b +; NEON-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; NEON-FIXED-NEXT: cmhi d4, d3, d2 +; NEON-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b +; NEON-FIXED-NEXT: fmov x8, d2 +; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 +; NEON-FIXED-NEXT: ldr w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i32: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: ushll v2.4s, v1.4h, #0 +; SVE-FIXED-NEXT: index z4.d, #0, #1 +; SVE-FIXED-NEXT: shl v1.4h, v1.4h, #15 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0 +; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0 +; SVE-FIXED-NEXT: cmlt v1.4h, v1.4h, #0 +; SVE-FIXED-NEXT: mov z5.d, z4.d +; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63 +; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63 +; SVE-FIXED-NEXT: umaxv h1, v1.4h +; SVE-FIXED-NEXT: add z5.d, z5.d, #2 // =0x2 +; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0 +; SVE-FIXED-NEXT: and v2.16b, v2.16b, v4.16b +; SVE-FIXED-NEXT: and v3.16b, v3.16b, v5.16b +; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d +; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b +; SVE-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b +; SVE-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; SVE-FIXED-NEXT: cmhi d4, d3, d2 +; SVE-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b +; SVE-FIXED-NEXT: fmov x8, d2 +; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 +; SVE-FIXED-NEXT: ldr w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) + ret i32 %res +} + +define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) { +; CHECK-LABEL: extract_last_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: fmov d2, xzr +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: shl v1.2s, v1.2s, #31 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str q0, [sp] +; 
CHECK-NEXT: shl v3.2d, v3.2d, #63 +; CHECK-NEXT: cmlt v1.2s, v1.2s, #0 +; CHECK-NEXT: cmlt v3.2d, v3.2d, #0 +; CHECK-NEXT: umaxp v1.2s, v1.2s, v1.2s +; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: and v3.8b, v3.8b, v4.8b +; CHECK-NEXT: cmhi d2, d2, d3 +; CHECK-NEXT: bic v2.8b, v3.8b, v2.8b +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: orr x8, x9, x8, lsl #3 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: ldr x8, [x8] +; CHECK-NEXT: tst w9, #0x1 +; CHECK-NEXT: csel x0, x8, x0, ne +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) + ret i64 %res +} + +define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 { +; CHECK-LABEL: extract_last_i8_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: mov z3.d, #0 // =0x0 +; CHECK-NEXT: punpkhi p4.h, p0.b +; CHECK-NEXT: punpklo p5.h, p2.b +; CHECK-NEXT: punpkhi p1.h, p4.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p4.b +; CHECK-NEXT: incd z2.d +; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: punpklo p4.h, p5.b +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: punpkhi p6.h, p1.b +; CHECK-NEXT: punpkhi p7.h, p3.b +; CHECK-NEXT: sel z1.d, p4, z1.d, z3.d +; CHECK-NEXT: mov z4.d, z2.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: mov z25.d, z5.d +; CHECK-NEXT: punpkhi p5.h, p5.b +; CHECK-NEXT: punpkhi p4.h, p2.b +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z25.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: sel z2.d, p5, z2.d, z3.d +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: mov z24.d, z4.d +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: sel z5.d, p3, z5.d, z3.d +; CHECK-NEXT: sel z4.d, p7, z4.d, z3.d +; CHECK-NEXT: sel z6.d, p2, z6.d, z3.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z25.d, p1, z25.d, z3.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: umax z1.d, p1/m, z1.d, z6.d +; CHECK-NEXT: sel z24.d, p6, z24.d, z3.d +; CHECK-NEXT: mov z3.d, p4/m, z7.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: umax z4.d, p1/m, z4.d, z24.d +; CHECK-NEXT: umax z2.d, p1/m, z2.d, z3.d +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: umax z3.d, p1/m, z3.d, z25.d +; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d +; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d +; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d +; CHECK-NEXT: umaxv d1, p1, z1.d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: whilels p1.b, xzr, x8 +; CHECK-NEXT: ptest p0, p0.b +; CHECK-NEXT: lastb w8, p1, z0.b +; CHECK-NEXT: csel w0, w8, w0, ne +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded
Reload +; CHECK-NEXT: ret + %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) + ret i8 %res +} + +define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 { +; CHECK-LABEL: extract_last_i16_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: mov z5.d, #0 // =0x0 +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p3.h, p1.b +; CHECK-NEXT: punpkhi p4.h, p2.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: incd z2.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: sel z1.d, p2, z1.d, z5.d +; CHECK-NEXT: mov z4.d, z2.d +; CHECK-NEXT: sel z2.d, p4, z2.d, z5.d +; CHECK-NEXT: sel z3.d, p1, z3.d, z5.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d +; CHECK-NEXT: sel z4.d, p3, z4.d, z5.d +; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d +; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d +; CHECK-NEXT: umaxv d1, p1, z1.d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: whilels p1.h, xzr, x8 +; CHECK-NEXT: lastb w8, p1, z0.h +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: csel w0, w8, w0, ne +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) + ret i16 %res +} + +define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 { +; CHECK-LABEL: extract_last_i32_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z3.d, #0 // =0x0 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: sel z1.d, p2, z1.d, z3.d +; CHECK-NEXT: incd z2.d +; CHECK-NEXT: sel z2.d, p1, z2.d, z3.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d +; CHECK-NEXT: umaxv d1, p1, z1.d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: whilels p1.s, xzr, x8 +; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: csel w0, w8, w0, ne +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) + ret i32 %res +} + +define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 { +; CHECK-LABEL: extract_last_i64_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d +; CHECK-NEXT: umaxv d1, p1, z1.d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: whilels p2.d, xzr, x8 +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: lastb x8, p2, z0.d +; CHECK-NEXT: csel x0, x8, x0, ne +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) + ret i64 %res +} + +declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8) declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8
x i16>, <8 x i1>, i16) declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32) declare i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64) declare i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8) declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16) declare i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32) declare i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64) + attributes #0 = { "target-features"="+sve" vscale_range(1, 16) } From c7e7193ad9c261ea50bdc79e64bac276bcc1f66d Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 7 Nov 2024 17:20:20 +0000 Subject: [PATCH 2/3] Address review comments --- llvm/docs/LangRef.rst | 19 +- llvm/include/llvm/IR/Intrinsics.td | 2 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 32 +- llvm/lib/IR/AutoUpgrade.cpp | 3 + .../AArch64/vector-extract-last-active.ll | 420 +++++++++++ .../CodeGen/AArch64/vector-masked-extract.ll | 663 ------------------ 6 files changed, 456 insertions(+), 683 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/vector-extract-last-active.ll delete mode 100644 llvm/test/CodeGen/AArch64/vector-masked-extract.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 11c848039b849..f67692f4bf17c 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20004,18 +20004,15 @@ the follow sequence of operations: The ``mask`` operand will apply to at least the gather and scatter operations. -'``llvm.experimental.vector.masked.extract.last.active``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.experimental.vector.extract.last.active``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This is an overloaded intrinsic. -This intrinsic will extract the value from a single lane of a vector, based -on a supplied mask vector. - :: - declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) - declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) + declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) + declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) Arguments: """""""""" @@ -20030,10 +20027,10 @@ the passthru value must match that of the elements of the data vector. Semantics: """""""""" -The '``llvm.experimental.vector.masked.extract.last.active``' intrinsic will -find the index of the most significant active lane in the mask vector, and -extract the element at that index in the corresponding data vector. If no mask -lanes are active then the passthru value is returned instead. +The '``llvm.experimental.vector.extract.last.active``' intrinsic will extract an +element from the data vector at the index matching the highest active lane of +the mask vector. If no mask lanes are active then the passthru value is +returned instead.
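As a usage sketch (hypothetical IR in the style of the updated tests below, where the mask comes from an ``icmp`` rather than being passed around as an ``i1`` vector directly), a vectorized "record the last match" loop can feed its comparison result straight into the intrinsic; ``%cond`` and ``%prev`` stand in for loop-carried values:

::

    %notzero = icmp ne <4 x i32> %cond, zeroinitializer
    %last = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %notzero, i32 %prev)

When no lane of ``%notzero`` is set, ``%last`` is simply ``%prev``, the value carried in from the previous iteration.

..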
_int_vector_compress: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 3a24f97f4cfa2..88a4201dcfe37 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1929,7 +1929,7 @@ def int_experimental_vector_match : DefaultAttrsIntrinsic< [ IntrNoMem, IntrNoSync, IntrWillReturn ]>; // Extract based on mask bits -def int_experimental_vector_masked_extract_last_active: +def int_experimental_vector_extract_last_active: DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMVectorElementType<0>], [IntrNoMem]>; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index b63ed6fc12d52..600905421a357 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8236,7 +8236,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitVectorHistogram(I, Intrinsic); return; } - case Intrinsic::experimental_vector_masked_extract_last_active: { + case Intrinsic::experimental_vector_extract_last_active: { SDValue Data = getValue(I.getOperand(0)); SDValue Mask = getValue(I.getOperand(1)); SDValue PassThru = getValue(I.getOperand(2)); @@ -8244,16 +8244,32 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, EVT DataVT = Data.getValueType(); EVT ScalarVT = PassThru.getValueType(); EVT BoolVT = Mask.getValueType().getScalarType(); - EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); - EVT IdxVecVT = DataVT.changeVectorElementType(IdxVT); - SDValue Zeroes = DAG.getConstant(0, sdl, IdxVecVT); - SDValue StepVec = DAG.getStepVector(sdl, IdxVecVT); - SDValue ActiveElts = DAG.getSelect(sdl, IdxVecVT, Mask, StepVec, Zeroes); + // Find a suitable type for a stepvector. + ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value. + if (DataVT.isScalableVector()) + VScaleRange = getVScaleRange(I.getCaller(), 64); + unsigned EltWidth = TLI.getBitWidthForCttzElements( + I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true, + &VScaleRange); + MVT StepVT = MVT::getIntegerVT(EltWidth); + EVT StepVecVT = DataVT.changeVectorElementType(StepVT); + + // Zero out lanes with inactive elements, then find the highest remaining + // value from the stepvector. + SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT); + SDValue StepVec = DAG.getStepVector(sdl, StepVecVT); + SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes); SDValue HighestIdx = - DAG.getNode(ISD::VECREDUCE_UMAX, sdl, IdxVT, ActiveElts); + DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts); + + // Extract the corresponding lane from the data vector + EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT); SDValue Extract = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, HighestIdx); + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx); + + // If all mask lanes were inactive, choose the passthru value instead. 
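+    // (Net effect: Result = any(Mask) ? Data[umax(Mask ? step : 0)] : PassThru.)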
SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask); SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru); setValue(&I, Result); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d0e0da53307cf..e73538da282e9 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1119,6 +1119,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("experimental.vector.")) { Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name) + // Skip over extract.last.active, otherwise it will be 'upgraded' + // to a regular vector extract which is a different operation. + .StartsWith("extract.last.active.", Intrinsic::not_intrinsic) .StartsWith("extract.", Intrinsic::vector_extract) .StartsWith("insert.", Intrinsic::vector_insert) .StartsWith("splice.", Intrinsic::vector_splice) diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll new file mode 100644 index 0000000000000..c0f1720e1cf8b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll @@ -0,0 +1,420 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED +; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED + +define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) { +; NEON-FIXED-LABEL: extract_last_i8: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0 +; NEON-FIXED-NEXT: adrp x8, .LCPI0_0 +; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b +; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b +; NEON-FIXED-NEXT: umaxv b1, v1.16b +; NEON-FIXED-NEXT: umaxv b2, v2.16b +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4 +; NEON-FIXED-NEXT: ldrb w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i8: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: index z2.b, #0, #1 +; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0 +; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: bic v2.16b, v2.16b, v3.16b +; SVE-FIXED-NEXT: umaxv b1, v1.16b +; SVE-FIXED-NEXT: umaxv b2, v2.16b +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4 +; SVE-FIXED-NEXT: ldrb w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %notzero = icmp ne <16 x i8> %mask, zeroinitializer + %res = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %notzero, i8 %passthru) + ret i8 %res +} + +define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { +; NEON-FIXED-LABEL: extract_last_i16: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h +; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
+; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0] +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: xtn v1.8b, v1.8h +; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; NEON-FIXED-NEXT: umaxv b1, v1.8b +; NEON-FIXED-NEXT: umaxv b2, v2.8b +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfi x9, x8, #1, #3 +; NEON-FIXED-NEXT: ldrh w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i16: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h +; SVE-FIXED-NEXT: index z2.b, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: xtn v1.8b, v1.8h +; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; SVE-FIXED-NEXT: umaxv b1, v1.8b +; SVE-FIXED-NEXT: umaxv b2, v2.8b +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfi x9, x8, #1, #3 +; SVE-FIXED-NEXT: ldrh w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %notzero = icmp ne <8 x i16> %mask, zeroinitializer + %res = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %notzero, i16 %passthru) + ret i16 %res +} + +define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { +; NEON-FIXED-LABEL: extract_last_i32: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s +; NEON-FIXED-NEXT: adrp x8, .LCPI2_0 +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0] +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: xtn v1.4h, v1.4s +; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; NEON-FIXED-NEXT: umaxv h1, v1.4h +; NEON-FIXED-NEXT: umaxv h2, v2.4h +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 +; NEON-FIXED-NEXT: ldr w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i32: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s +; SVE-FIXED-NEXT: index z2.h, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: xtn v1.4h, v1.4s +; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; SVE-FIXED-NEXT: umaxv h1, v1.4h +; SVE-FIXED-NEXT: umaxv h2, v2.4h +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 +; SVE-FIXED-NEXT: ldr w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %notzero = icmp ne <4 x i32> %mask, zeroinitializer + %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %notzero, i32 %passthru) + ret i32 %res +} + +define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { +; NEON-FIXED-LABEL: extract_last_i64: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d +; NEON-FIXED-NEXT: adrp x8, .LCPI3_0 +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0] +; 
NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: xtn v1.2s, v1.2d +; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 +; NEON-FIXED-NEXT: ldr x8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel x0, x8, x0, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_i64: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d +; SVE-FIXED-NEXT: index z2.s, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: xtn v1.2s, v1.2d +; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfi x9, x8, #3, #1 +; SVE-FIXED-NEXT: ldr x8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel x0, x8, x0, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %notzero = icmp ne <2 x i64> %mask, zeroinitializer + %res = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %notzero, i64 %passthru) + ret i64 %res +} + +define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %passthru) { +; NEON-FIXED-LABEL: extract_last_float: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s +; NEON-FIXED-NEXT: adrp x8, .LCPI4_0 +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0] +; NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: xtn v1.4h, v1.4s +; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; NEON-FIXED-NEXT: umaxv h1, v1.4h +; NEON-FIXED-NEXT: umaxv h3, v3.4h +; NEON-FIXED-NEXT: fmov w8, s3 +; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 +; NEON-FIXED-NEXT: fmov w8, s1 +; NEON-FIXED-NEXT: ldr s0, [x9] +; NEON-FIXED-NEXT: tst w8, #0x1 +; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_float: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s +; SVE-FIXED-NEXT: index z3.h, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: xtn v1.4h, v1.4s +; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; SVE-FIXED-NEXT: umaxv h1, v1.4h +; SVE-FIXED-NEXT: umaxv h3, v3.4h +; SVE-FIXED-NEXT: fmov w8, s3 +; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 +; SVE-FIXED-NEXT: fmov w8, s1 +; SVE-FIXED-NEXT: ldr s0, [x9] +; SVE-FIXED-NEXT: tst w8, #0x1 +; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %notzero = icmp ne <4 x i32> %mask, zeroinitializer + %res = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %notzero, float %passthru) + ret float %res +} + +define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %passthru) { +; NEON-FIXED-LABEL: extract_last_double: +; NEON-FIXED: // %bb.0: +; NEON-FIXED-NEXT: sub sp, sp, #16 +; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 +; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d +; NEON-FIXED-NEXT: adrp x8, .LCPI5_0 +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0] +; 
NEON-FIXED-NEXT: str q0, [sp] +; NEON-FIXED-NEXT: xtn v1.2s, v1.2d +; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s +; NEON-FIXED-NEXT: fmov w8, s3 +; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 +; NEON-FIXED-NEXT: fmov w8, s1 +; NEON-FIXED-NEXT: ldr d0, [x9] +; NEON-FIXED-NEXT: tst w8, #0x1 +; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne +; NEON-FIXED-NEXT: add sp, sp, #16 +; NEON-FIXED-NEXT: ret +; +; SVE-FIXED-LABEL: extract_last_double: +; SVE-FIXED: // %bb.0: +; SVE-FIXED-NEXT: sub sp, sp, #16 +; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 +; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d +; SVE-FIXED-NEXT: index z3.s, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: str q0, [sp] +; SVE-FIXED-NEXT: xtn v1.2s, v1.2d +; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s +; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s +; SVE-FIXED-NEXT: fmov w8, s3 +; SVE-FIXED-NEXT: bfi x9, x8, #3, #1 +; SVE-FIXED-NEXT: fmov w8, s1 +; SVE-FIXED-NEXT: ldr d0, [x9] +; SVE-FIXED-NEXT: tst w8, #0x1 +; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne +; SVE-FIXED-NEXT: add sp, sp, #16 +; SVE-FIXED-NEXT: ret + %notzero = icmp ne <2 x i64> %mask, zeroinitializer + %res = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> %data, <2 x i1> %notzero, double %passthru) + ret double %res +} + +define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 { +; CHECK-LABEL: extract_last_i8_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov z2.b, #0 // =0x0 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z2.b +; CHECK-NEXT: umaxv b1, p1, z1.b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p1.b, xzr, x8 +; CHECK-NEXT: ptest p0, p0.b +; CHECK-NEXT: lastb w8, p1, z0.b +; CHECK-NEXT: csel w0, w8, w0, ne +; CHECK-NEXT: ret + %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) + ret i8 %res +} + +define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 { +; CHECK-LABEL: extract_last_i16_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h +; CHECK-NEXT: umaxv h1, p1, z1.h +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p2.h, xzr, x8 +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: lastb w8, p2, z0.h +; CHECK-NEXT: csel w0, w8, w0, ne +; CHECK-NEXT: ret + %res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) + ret i16 %res +} + +define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 { +; CHECK-LABEL: extract_last_i32_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s +; CHECK-NEXT: umaxv s1, p1, z1.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p2.s, xzr, x8 +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: lastb w8, p2, z0.s +; CHECK-NEXT: csel w0, w8, w0, ne +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) + ret i32 %res +} + +define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 { +; CHECK-LABEL: extract_last_i64_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: ptrue
p1.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d +; CHECK-NEXT: umaxv d1, p1, z1.d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p2.d, xzr, x8 +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: lastb x8, p2, z0.d +; CHECK-NEXT: csel x0, x8, x0, ne +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) + ret i64 %res +} + +define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) #0 { +; CHECK-LABEL: extract_last_float_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: mov z3.s, #0 // =0x0 +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z3.s +; CHECK-NEXT: umaxv s2, p1, z2.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p2.s, xzr, x8 +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: lastb s0, p2, z0.s +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: ret + %res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) + ret float %res +} + +define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) #0 { +; CHECK-LABEL: extract_last_double_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: index z2.d, #0, #1 +; CHECK-NEXT: mov z3.d, #0 // =0x0 +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sel z2.d, p0, z2.d, z3.d +; CHECK-NEXT: umaxv d2, p1, z2.d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p2.d, xzr, x8 +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: lastb d0, p2, z0.d +; CHECK-NEXT: fcsel d0, d0, d1, ne +; CHECK-NEXT: ret + %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) + ret double %res +} + +declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8) declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16) declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32) declare i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64) declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float>, <4 x i1>, float) declare double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double>, <2 x i1>, double) declare i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8) declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16) declare i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32) declare i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64) declare float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float) declare double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double) + +attributes #0 = { "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll b/llvm/test/CodeGen/AArch64/vector-masked-extract.ll deleted file mode 100644 index 04adf4e476b04..0000000000000 --- a/llvm/test/CodeGen/AArch64/vector-masked-extract.ll +++ /dev/null @@ -1,663 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,NEON-FIXED -; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SVE-FIXED - -define i8 @extract_last_i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) { -; NEON-FIXED-LABEL: extract_last_i8: -;
-; NEON-FIXED: // %bb.0:
-; NEON-FIXED-NEXT: sub sp, sp, #16
-; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: umov w15, v1.b[14]
-; NEON-FIXED-NEXT: umov w14, v1.b[6]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
-; NEON-FIXED-NEXT: umov w12, v1.b[15]
-; NEON-FIXED-NEXT: umov w13, v1.b[10]
-; NEON-FIXED-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
-; NEON-FIXED-NEXT: umov w11, v1.b[2]
-; NEON-FIXED-NEXT: umov w8, v1.b[7]
-; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: umov w9, v1.b[11]
-; NEON-FIXED-NEXT: umov w10, v1.b[3]
-; NEON-FIXED-NEXT: umov w16, v1.b[12]
-; NEON-FIXED-NEXT: fmov s3, w15
-; NEON-FIXED-NEXT: umov w15, v1.b[4]
-; NEON-FIXED-NEXT: fmov s4, w14
-; NEON-FIXED-NEXT: fmov s5, w13
-; NEON-FIXED-NEXT: umov w13, v1.b[0]
-; NEON-FIXED-NEXT: umov w14, v1.b[13]
-; NEON-FIXED-NEXT: fmov s6, w11
-; NEON-FIXED-NEXT: umov w11, v1.b[5]
-; NEON-FIXED-NEXT: mov v3.s[1], w12
-; NEON-FIXED-NEXT: umov w12, v1.b[8]
-; NEON-FIXED-NEXT: mov v4.s[1], w8
-; NEON-FIXED-NEXT: umov w8, v1.b[9]
-; NEON-FIXED-NEXT: mov v5.s[1], w9
-; NEON-FIXED-NEXT: umov w9, v1.b[1]
-; NEON-FIXED-NEXT: fmov s7, w16
-; NEON-FIXED-NEXT: fmov s16, w15
-; NEON-FIXED-NEXT: mov v6.s[1], w10
-; NEON-FIXED-NEXT: fmov s18, w13
-; NEON-FIXED-NEXT: shl v1.16b, v1.16b, #7
-; NEON-FIXED-NEXT: fmov s17, w12
-; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; NEON-FIXED-NEXT: mov v7.s[1], w14
-; NEON-FIXED-NEXT: mov v16.s[1], w11
-; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; NEON-FIXED-NEXT: mov v18.s[1], w9
-; NEON-FIXED-NEXT: adrp x9, .LCPI0_2
-; NEON-FIXED-NEXT: ushll v6.2d, v6.2s, #0
-; NEON-FIXED-NEXT: ldr q20, [x9, :lo12:.LCPI0_2]
-; NEON-FIXED-NEXT: adrp x9, .LCPI0_7
-; NEON-FIXED-NEXT: mov v17.s[1], w8
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_1
-; NEON-FIXED-NEXT: ldr q23, [x9, :lo12:.LCPI0_7]
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_1]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_3
-; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; NEON-FIXED-NEXT: ushll v7.2d, v7.2s, #0
-; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; NEON-FIXED-NEXT: ushll v16.2d, v16.2s, #0
-; NEON-FIXED-NEXT: ushll v17.2d, v17.2s, #0
-; NEON-FIXED-NEXT: shl v6.2d, v6.2d, #63
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: ushll v18.2d, v18.2s, #0
-; NEON-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
-; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; NEON-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
-; NEON-FIXED-NEXT: and v2.16b, v3.16b, v2.16b
-; NEON-FIXED-NEXT: shl v3.2d, v7.2d, #63
-; NEON-FIXED-NEXT: shl v7.2d, v16.2d, #63
-; NEON-FIXED-NEXT: shl v16.2d, v17.2d, #63
-; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI0_3]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_4
-; NEON-FIXED-NEXT: ldr q21, [x8, :lo12:.LCPI0_4]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_5
-; NEON-FIXED-NEXT: shl v18.2d, v18.2d, #63
-; NEON-FIXED-NEXT: ldr q22, [x8, :lo12:.LCPI0_5]
-; NEON-FIXED-NEXT: adrp x8, .LCPI0_6
-; NEON-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
-; NEON-FIXED-NEXT: ldr q19, [x8, :lo12:.LCPI0_6]
-; NEON-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
-; NEON-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
-; NEON-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
-; NEON-FIXED-NEXT: and v6.16b, v6.16b, v17.16b
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
-; NEON-FIXED-NEXT: umaxv b1, v1.16b
-; NEON-FIXED-NEXT: and v16.16b, v16.16b, v19.16b
-; NEON-FIXED-NEXT: and v17.16b, v18.16b, v23.16b
-; NEON-FIXED-NEXT: cmhi v18.2d, v4.2d, v2.2d
-; NEON-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
-; NEON-FIXED-NEXT: and v3.16b, v3.16b, v21.16b
-; NEON-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
-; NEON-FIXED-NEXT: cmhi v21.2d, v17.2d, v16.2d
-; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v18.16b
-; NEON-FIXED-NEXT: mov v4.16b, v19.16b
-; NEON-FIXED-NEXT: cmhi v20.2d, v7.2d, v3.2d
-; NEON-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
-; NEON-FIXED-NEXT: mov v5.16b, v21.16b
-; NEON-FIXED-NEXT: bit v3.16b, v7.16b, v20.16b
-; NEON-FIXED-NEXT: bsl v5.16b, v17.16b, v16.16b
-; NEON-FIXED-NEXT: cmhi v6.2d, v4.2d, v2.2d
-; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v3.2d
-; NEON-FIXED-NEXT: bit v2.16b, v4.16b, v6.16b
-; NEON-FIXED-NEXT: bit v3.16b, v5.16b, v7.16b
-; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT: cmhi d4, d2, d3
-; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
-; NEON-FIXED-NEXT: ldrb w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
-; NEON-FIXED-NEXT: add sp, sp, #16
-; NEON-FIXED-NEXT: ret
-;
-; SVE-FIXED-LABEL: extract_last_i8:
-; SVE-FIXED: // %bb.0:
-; SVE-FIXED-NEXT: sub sp, sp, #16
-; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: umov w8, v1.b[14]
-; SVE-FIXED-NEXT: umov w9, v1.b[6]
-; SVE-FIXED-NEXT: index z2.d, #0, #1
-; SVE-FIXED-NEXT: umov w12, v1.b[2]
-; SVE-FIXED-NEXT: umov w10, v1.b[10]
-; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: umov w13, v1.b[12]
-; SVE-FIXED-NEXT: umov w11, v1.b[15]
-; SVE-FIXED-NEXT: umov w14, v1.b[4]
-; SVE-FIXED-NEXT: umov w16, v1.b[0]
-; SVE-FIXED-NEXT: umov w15, v1.b[8]
-; SVE-FIXED-NEXT: fmov s3, w8
-; SVE-FIXED-NEXT: umov w8, v1.b[7]
-; SVE-FIXED-NEXT: fmov s4, w9
-; SVE-FIXED-NEXT: umov w9, v1.b[11]
-; SVE-FIXED-NEXT: fmov s6, w12
-; SVE-FIXED-NEXT: umov w12, v1.b[3]
-; SVE-FIXED-NEXT: fmov s5, w10
-; SVE-FIXED-NEXT: umov w10, v1.b[1]
-; SVE-FIXED-NEXT: fmov s7, w13
-; SVE-FIXED-NEXT: umov w13, v1.b[13]
-; SVE-FIXED-NEXT: fmov s16, w14
-; SVE-FIXED-NEXT: fmov s18, w16
-; SVE-FIXED-NEXT: mov v4.s[1], w8
-; SVE-FIXED-NEXT: umov w8, v1.b[5]
-; SVE-FIXED-NEXT: mov v3.s[1], w11
-; SVE-FIXED-NEXT: mov v5.s[1], w9
-; SVE-FIXED-NEXT: mov v6.s[1], w12
-; SVE-FIXED-NEXT: umov w9, v1.b[9]
-; SVE-FIXED-NEXT: fmov s17, w15
-; SVE-FIXED-NEXT: mov v18.s[1], w10
-; SVE-FIXED-NEXT: mov z19.d, z2.d
-; SVE-FIXED-NEXT: mov v7.s[1], w13
-; SVE-FIXED-NEXT: mov z20.d, z2.d
-; SVE-FIXED-NEXT: mov z21.d, z2.d
-; SVE-FIXED-NEXT: mov v16.s[1], w8
-; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; SVE-FIXED-NEXT: ushll v6.2d, v6.2s, #0
-; SVE-FIXED-NEXT: mov v17.s[1], w9
-; SVE-FIXED-NEXT: mov x9, sp
-; SVE-FIXED-NEXT: ushll v18.2d, v18.2s, #0
-; SVE-FIXED-NEXT: mov z25.d, z2.d
-; SVE-FIXED-NEXT: ushll v7.2d, v7.2s, #0
-; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; SVE-FIXED-NEXT: ushll v16.2d, v16.2s, #0
-; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; SVE-FIXED-NEXT: shl v6.2d, v6.2d, #63
-; SVE-FIXED-NEXT: mov z22.d, z2.d
-; SVE-FIXED-NEXT: mov z23.d, z2.d
-; SVE-FIXED-NEXT: add z19.d, z19.d, #6 // =0x6
-; SVE-FIXED-NEXT: shl v18.2d, v18.2d, #63
-; SVE-FIXED-NEXT: ushll v17.2d, v17.2s, #0
-; SVE-FIXED-NEXT: shl v7.2d, v7.2d, #63
-; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; SVE-FIXED-NEXT: add z25.d, z25.d, #14 // =0xe
-; SVE-FIXED-NEXT: shl v16.2d, v16.2d, #63
-; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; SVE-FIXED-NEXT: add z20.d, z20.d, #10 // =0xa
-; SVE-FIXED-NEXT: cmlt v6.2d, v6.2d, #0
-; SVE-FIXED-NEXT: add z21.d, z21.d, #2 // =0x2
-; SVE-FIXED-NEXT: mov z24.d, z2.d
-; SVE-FIXED-NEXT: shl v17.2d, v17.2d, #63
-; SVE-FIXED-NEXT: cmlt v18.2d, v18.2d, #0
-; SVE-FIXED-NEXT: cmlt v7.2d, v7.2d, #0
-; SVE-FIXED-NEXT: add z22.d, z22.d, #12 // =0xc
-; SVE-FIXED-NEXT: cmlt v16.2d, v16.2d, #0
-; SVE-FIXED-NEXT: add z23.d, z23.d, #4 // =0x4
-; SVE-FIXED-NEXT: and v3.16b, v3.16b, v25.16b
-; SVE-FIXED-NEXT: and v4.16b, v4.16b, v19.16b
-; SVE-FIXED-NEXT: and v5.16b, v5.16b, v20.16b
-; SVE-FIXED-NEXT: and v6.16b, v6.16b, v21.16b
-; SVE-FIXED-NEXT: cmlt v17.2d, v17.2d, #0
-; SVE-FIXED-NEXT: add z24.d, z24.d, #8 // =0x8
-; SVE-FIXED-NEXT: and v2.16b, v18.16b, v2.16b
-; SVE-FIXED-NEXT: and v7.16b, v7.16b, v22.16b
-; SVE-FIXED-NEXT: and v16.16b, v16.16b, v23.16b
-; SVE-FIXED-NEXT: cmhi v18.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT: shl v1.16b, v1.16b, #7
-; SVE-FIXED-NEXT: cmhi v19.2d, v6.2d, v5.2d
-; SVE-FIXED-NEXT: and v17.16b, v17.16b, v24.16b
-; SVE-FIXED-NEXT: cmhi v20.2d, v16.2d, v7.2d
-; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v18.16b
-; SVE-FIXED-NEXT: cmlt v1.16b, v1.16b, #0
-; SVE-FIXED-NEXT: mov v4.16b, v19.16b
-; SVE-FIXED-NEXT: cmhi v21.2d, v2.2d, v17.2d
-; SVE-FIXED-NEXT: umaxv b1, v1.16b
-; SVE-FIXED-NEXT: bsl v4.16b, v6.16b, v5.16b
-; SVE-FIXED-NEXT: mov v5.16b, v20.16b
-; SVE-FIXED-NEXT: bif v2.16b, v17.16b, v21.16b
-; SVE-FIXED-NEXT: bsl v5.16b, v16.16b, v7.16b
-; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
-; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
-; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
-; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: cmhi d4, d2, d3
-; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
-; SVE-FIXED-NEXT: ldrb w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
-; SVE-FIXED-NEXT: add sp, sp, #16
-; SVE-FIXED-NEXT: ret
-  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
-  ret i8 %res
-}
-
-define i16 @extract_last_i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i16:
-; NEON-FIXED: // %bb.0:
-; NEON-FIXED-NEXT: sub sp, sp, #16
-; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
-; NEON-FIXED-NEXT: umov w8, v1.b[6]
-; NEON-FIXED-NEXT: umov w9, v1.b[2]
-; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: umov w11, v1.b[4]
-; NEON-FIXED-NEXT: umov w12, v1.b[0]
-; NEON-FIXED-NEXT: umov w10, v1.b[7]
-; NEON-FIXED-NEXT: umov w13, v1.b[3]
-; NEON-FIXED-NEXT: umov w14, v1.b[5]
-; NEON-FIXED-NEXT: umov w15, v1.b[1]
-; NEON-FIXED-NEXT: shl v1.8b, v1.8b, #7
-; NEON-FIXED-NEXT: fmov s2, w8
-; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
-; NEON-FIXED-NEXT: fmov s3, w9
-; NEON-FIXED-NEXT: fmov s4, w11
-; NEON-FIXED-NEXT: adrp x9, .LCPI1_1
-; NEON-FIXED-NEXT: ldr q6, [x8, :lo12:.LCPI1_0]
-; NEON-FIXED-NEXT: fmov s5, w12
-; NEON-FIXED-NEXT: adrp x8, .LCPI1_3
-; NEON-FIXED-NEXT: ldr q7, [x9, :lo12:.LCPI1_1]
-; NEON-FIXED-NEXT: mov v2.s[1], w10
-; NEON-FIXED-NEXT: mov v3.s[1], w13
-; NEON-FIXED-NEXT: adrp x10, .LCPI1_2
-; NEON-FIXED-NEXT: mov v4.s[1], w14
-; NEON-FIXED-NEXT: ldr q16, [x10, :lo12:.LCPI1_2]
-; NEON-FIXED-NEXT: ldr q17, [x8, :lo12:.LCPI1_3]
-; NEON-FIXED-NEXT: mov v5.s[1], w15
-; NEON-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; NEON-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; NEON-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; NEON-FIXED-NEXT: umaxv b1, v1.8b
-; NEON-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; NEON-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; NEON-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; NEON-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
-; NEON-FIXED-NEXT: and v3.16b, v3.16b, v7.16b
-; NEON-FIXED-NEXT: and v4.16b, v4.16b, v16.16b
-; NEON-FIXED-NEXT: and v5.16b, v5.16b, v17.16b
-; NEON-FIXED-NEXT: cmhi v6.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT: cmhi v7.2d, v5.2d, v4.2d
-; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v6.16b
-; NEON-FIXED-NEXT: mov v3.16b, v7.16b
-; NEON-FIXED-NEXT: bsl v3.16b, v5.16b, v4.16b
-; NEON-FIXED-NEXT: cmhi v4.2d, v3.2d, v2.2d
-; NEON-FIXED-NEXT: bit v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT: cmhi d4, d2, d3
-; NEON-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
-; NEON-FIXED-NEXT: ldrh w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
-; NEON-FIXED-NEXT: add sp, sp, #16
-; NEON-FIXED-NEXT: ret
-;
-; SVE-FIXED-LABEL: extract_last_i16:
-; SVE-FIXED: // %bb.0:
-; SVE-FIXED-NEXT: sub sp, sp, #16
-; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: // kill: def $d1 killed $d1 def $q1
-; SVE-FIXED-NEXT: umov w8, v1.b[0]
-; SVE-FIXED-NEXT: umov w10, v1.b[6]
-; SVE-FIXED-NEXT: index z6.d, #0, #1
-; SVE-FIXED-NEXT: umov w11, v1.b[2]
-; SVE-FIXED-NEXT: umov w14, v1.b[4]
-; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: umov w9, v1.b[1]
-; SVE-FIXED-NEXT: umov w12, v1.b[7]
-; SVE-FIXED-NEXT: umov w13, v1.b[3]
-; SVE-FIXED-NEXT: fmov s2, w8
-; SVE-FIXED-NEXT: umov w8, v1.b[5]
-; SVE-FIXED-NEXT: fmov s3, w10
-; SVE-FIXED-NEXT: fmov s4, w11
-; SVE-FIXED-NEXT: fmov s5, w14
-; SVE-FIXED-NEXT: mov z7.d, z6.d
-; SVE-FIXED-NEXT: mov z16.d, z6.d
-; SVE-FIXED-NEXT: mov z17.d, z6.d
-; SVE-FIXED-NEXT: shl v1.8b, v1.8b, #7
-; SVE-FIXED-NEXT: mov v2.s[1], w9
-; SVE-FIXED-NEXT: mov x9, sp
-; SVE-FIXED-NEXT: mov v3.s[1], w12
-; SVE-FIXED-NEXT: mov v4.s[1], w13
-; SVE-FIXED-NEXT: mov v5.s[1], w8
-; SVE-FIXED-NEXT: add z7.d, z7.d, #2 // =0x2
-; SVE-FIXED-NEXT: add z17.d, z17.d, #6 // =0x6
-; SVE-FIXED-NEXT: add z16.d, z16.d, #4 // =0x4
-; SVE-FIXED-NEXT: cmlt v1.8b, v1.8b, #0
-; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; SVE-FIXED-NEXT: ushll v3.2d, v3.2s, #0
-; SVE-FIXED-NEXT: ushll v4.2d, v4.2s, #0
-; SVE-FIXED-NEXT: ushll v5.2d, v5.2s, #0
-; SVE-FIXED-NEXT: umaxv b1, v1.8b
-; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT: shl v4.2d, v4.2d, #63
-; SVE-FIXED-NEXT: shl v5.2d, v5.2d, #63
-; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT: cmlt v4.2d, v4.2d, #0
-; SVE-FIXED-NEXT: cmlt v5.2d, v5.2d, #0
-; SVE-FIXED-NEXT: and v2.16b, v2.16b, v6.16b
-; SVE-FIXED-NEXT: and v3.16b, v3.16b, v17.16b
-; SVE-FIXED-NEXT: and v4.16b, v4.16b, v7.16b
-; SVE-FIXED-NEXT: and v5.16b, v5.16b, v16.16b
-; SVE-FIXED-NEXT: cmhi v6.2d, v4.2d, v3.2d
-; SVE-FIXED-NEXT: cmhi v7.2d, v2.2d, v5.2d
-; SVE-FIXED-NEXT: bit v3.16b, v4.16b, v6.16b
-; SVE-FIXED-NEXT: bif v2.16b, v5.16b, v7.16b
-; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: cmhi d4, d2, d3
-; SVE-FIXED-NEXT: bif v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
-; SVE-FIXED-NEXT: ldrh w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
-; SVE-FIXED-NEXT: add sp, sp, #16
-; SVE-FIXED-NEXT: ret
-  %res = call i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %mask, i16 %passthru)
-  ret i16 %res
-}
-
-define i32 @extract_last_i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
-; NEON-FIXED-LABEL: extract_last_i32:
-; NEON-FIXED: // %bb.0:
-; NEON-FIXED-NEXT: sub sp, sp, #16
-; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: ushll v2.4s, v1.4h, #0
-; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
-; NEON-FIXED-NEXT: adrp x9, .LCPI2_1
-; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
-; NEON-FIXED-NEXT: ldr q5, [x9, :lo12:.LCPI2_1]
-; NEON-FIXED-NEXT: shl v1.4h, v1.4h, #15
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
-; NEON-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; NEON-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
-; NEON-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; NEON-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; NEON-FIXED-NEXT: umaxv h1, v1.4h
-; NEON-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; NEON-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; NEON-FIXED-NEXT: and v3.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: and v2.16b, v2.16b, v5.16b
-; NEON-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; NEON-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
-; NEON-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; NEON-FIXED-NEXT: cmhi d4, d3, d2
-; NEON-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: ldr w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
-; NEON-FIXED-NEXT: add sp, sp, #16
-; NEON-FIXED-NEXT: ret
-;
-; SVE-FIXED-LABEL: extract_last_i32:
-; SVE-FIXED: // %bb.0:
-; SVE-FIXED-NEXT: sub sp, sp, #16
-; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: ushll v2.4s, v1.4h, #0
-; SVE-FIXED-NEXT: index z4.d, #0, #1
-; SVE-FIXED-NEXT: shl v1.4h, v1.4h, #15
-; SVE-FIXED-NEXT: mov x9, sp
-; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: ushll2 v3.2d, v2.4s, #0
-; SVE-FIXED-NEXT: ushll v2.2d, v2.2s, #0
-; SVE-FIXED-NEXT: cmlt v1.4h, v1.4h, #0
-; SVE-FIXED-NEXT: mov z5.d, z4.d
-; SVE-FIXED-NEXT: shl v3.2d, v3.2d, #63
-; SVE-FIXED-NEXT: shl v2.2d, v2.2d, #63
-; SVE-FIXED-NEXT: umaxv h1, v1.4h
-; SVE-FIXED-NEXT: add z5.d, z5.d, #2 // =0x2
-; SVE-FIXED-NEXT: cmlt v3.2d, v3.2d, #0
-; SVE-FIXED-NEXT: cmlt v2.2d, v2.2d, #0
-; SVE-FIXED-NEXT: and v2.16b, v2.16b, v4.16b
-; SVE-FIXED-NEXT: and v3.16b, v3.16b, v5.16b
-; SVE-FIXED-NEXT: cmhi v4.2d, v2.2d, v3.2d
-; SVE-FIXED-NEXT: bif v2.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: bic v3.16b, v3.16b, v4.16b
-; SVE-FIXED-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: cmhi d4, d3, d2
-; SVE-FIXED-NEXT: bit v2.8b, v3.8b, v4.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: ldr w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
-; SVE-FIXED-NEXT: add sp, sp, #16
-; SVE-FIXED-NEXT: ret
-  %res = call i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %mask, i32 %passthru)
-  ret i32 %res
-}
-
-define i64 @extract_last_i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru) {
-; CHECK-LABEL: extract_last_i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ushll v3.2d, v1.2s, #0
-; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: fmov d2, xzr
-; CHECK-NEXT: fmov d4, x8
-; CHECK-NEXT: shl v1.2s, v1.2s, #31
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: shl v3.2d, v3.2d, #63
-; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
-; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
-; CHECK-NEXT: umaxp v1.2s, v1.2s, v1.2s
-; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: and v3.8b, v3.8b, v4.8b
-; CHECK-NEXT: cmhi d2, d2, d3
-; CHECK-NEXT: bic v2.8b, v3.8b, v2.8b
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: orr x8, x9, x8, lsl #3
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: csel x0, x8, x0, ne
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
-  %res = call i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %mask, i64 %passthru)
-  ret i64 %res
-}
-
-define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) #0 {
-; CHECK-LABEL: extract_last_i8_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: mov z3.d, #0 // =0x0
-; CHECK-NEXT: punpkhi p4.h, p0.b
-; CHECK-NEXT: punpklo p5.h, p2.b
-; CHECK-NEXT: punpkhi p1.h, p4.b
-; CHECK-NEXT: mov z2.d, z1.d
-; CHECK-NEXT: mov z5.d, z1.d
-; CHECK-NEXT: mov z6.d, z1.d
-; CHECK-NEXT: punpkhi p3.h, p2.b
-; CHECK-NEXT: punpklo p2.h, p4.b
-; CHECK-NEXT: incd z2.d
-; CHECK-NEXT: incd z5.d, all, mul #2
-; CHECK-NEXT: punpklo p4.h, p5.b
-; CHECK-NEXT: incd z6.d, all, mul #4
-; CHECK-NEXT: punpkhi p6.h, p1.b
-; CHECK-NEXT: punpkhi p7.h, p3.b
-; CHECK-NEXT: sel z1.d, p4, z1.d, z3.d
-; CHECK-NEXT: mov z4.d, z2.d
-; CHECK-NEXT: mov z7.d, z2.d
-; CHECK-NEXT: mov z25.d, z5.d
-; CHECK-NEXT: punpkhi p5.h, p5.b
-; CHECK-NEXT: punpkhi p4.h, p2.b
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: incd z25.d, all, mul #4
-; CHECK-NEXT: incd z7.d, all, mul #4
-; CHECK-NEXT: punpklo p3.h, p3.b
-; CHECK-NEXT: sel z2.d, p5, z2.d, z3.d
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: punpklo p2.h, p2.b
-; CHECK-NEXT: mov z24.d, z4.d
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: sel z5.d, p3, z5.d, z3.d
-; CHECK-NEXT: sel z4.d, p7, z4.d, z3.d
-; CHECK-NEXT: sel z6.d, p2, z6.d, z3.d
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: sel z25.d, p1, z25.d, z3.d
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: incd z24.d, all, mul #4
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z6.d
-; CHECK-NEXT: sel z24.d, p6, z24.d, z3.d
-; CHECK-NEXT: mov z3.d, p4/m, z7.d
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: umax z4.d, p1/m, z4.d, z24.d
-; CHECK-NEXT: umax z2.d, p1/m, z2.d, z3.d
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: umax z3.d, p1/m, z3.d, z25.d
-; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p1.b, xzr, x8
-; CHECK-NEXT: ptest p0, p0.b
-; CHECK-NEXT: lastb w8, p1, z0.b
-; CHECK-NEXT: csel w0, w8, w0, ne
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
-  %res = call i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
-  ret i8 %res
-}
-
-define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) #0 {
-; CHECK-LABEL: extract_last_i16_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: mov z5.d, #0 // =0x0
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: punpkhi p3.h, p1.b
-; CHECK-NEXT: punpkhi p4.h, p2.b
-; CHECK-NEXT: mov z2.d, z1.d
-; CHECK-NEXT: mov z3.d, z1.d
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: punpklo p2.h, p2.b
-; CHECK-NEXT: incd z2.d
-; CHECK-NEXT: incd z3.d, all, mul #2
-; CHECK-NEXT: sel z1.d, p2, z1.d, z5.d
-; CHECK-NEXT: mov z4.d, z2.d
-; CHECK-NEXT: sel z2.d, p4, z2.d, z5.d
-; CHECK-NEXT: sel z3.d, p1, z3.d, z5.d
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z3.d
-; CHECK-NEXT: sel z4.d, p3, z4.d, z5.d
-; CHECK-NEXT: umax z2.d, p1/m, z2.d, z4.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p1.h, xzr, x8
-; CHECK-NEXT: lastb w8, p1, z0.h
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ptest p1, p0.b
-; CHECK-NEXT: csel w0, w8, w0, ne
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
-  %res = call i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
-  ret i16 %res
-}
-
-define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) #0 {
-; CHECK-LABEL: extract_last_i32_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: mov z3.d, #0 // =0x0
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: mov z2.d, z1.d
-; CHECK-NEXT: sel z1.d, p2, z1.d, z3.d
-; CHECK-NEXT: incd z2.d
-; CHECK-NEXT: sel z2.d, p1, z2.d, z3.d
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: umax z1.d, p1/m, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p1.s, xzr, x8
-; CHECK-NEXT: lastb w8, p1, z0.s
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ptest p1, p0.b
-; CHECK-NEXT: csel w0, w8, w0, ne
-; CHECK-NEXT: ret
-  %res = call i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
-  ret i32 %res
-}
-
-define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) #0 {
-; CHECK-LABEL: extract_last_i64_scalable:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: mov z2.d, #0 // =0x0
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
-; CHECK-NEXT: umaxv d1, p1, z1.d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: whilels p2.d, xzr, x8
-; CHECK-NEXT: ptest p1, p0.b
-; CHECK-NEXT: lastb x8, p2, z0.d
-; CHECK-NEXT: csel x0, x8, x0, ne
-; CHECK-NEXT: ret
-  %res = call i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
-  ret i64 %res
-}
-
-declare i8 @llvm.experimental.vector.masked.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
-declare i16 @llvm.experimental.vector.masked.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
-declare i32 @llvm.experimental.vector.masked.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
-declare i64 @llvm.experimental.vector.masked.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
-declare i8 @llvm.experimental.vector.masked.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
-declare i16 @llvm.experimental.vector.masked.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
-declare i32 @llvm.experimental.vector.masked.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
-declare i64 @llvm.experimental.vector.masked.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
-
-attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }

From 9a6454efdf4b865fe47e897683c2c25a9dc837ba Mon Sep 17 00:00:00 2001
From: Graham Hunter
Date: Thu, 14 Nov 2024 13:44:45 +0000
Subject: [PATCH 3/3] Move lowering code to dedicated function

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 81 ++++++++++---------
 .../SelectionDAG/SelectionDAGBuilder.h        |  1 +
 2 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 600905421a357..9d729d448502d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6415,6 +6415,50 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
   DAG.setRoot(Histogram);
 }
 
+void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
+                                                       unsigned Intrinsic) {
+  assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
+         "Tried lowering invalid vector extract last");
+  SDLoc sdl = getCurSDLoc();
+  SDValue Data = getValue(I.getOperand(0));
+  SDValue Mask = getValue(I.getOperand(1));
+  SDValue PassThru = getValue(I.getOperand(2));
+
+  EVT DataVT = Data.getValueType();
+  EVT ScalarVT = PassThru.getValueType();
+  EVT BoolVT = Mask.getValueType().getScalarType();
+
+  // Find a suitable type for a stepvector.
+  ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+  if (DataVT.isScalableVector())
+    VScaleRange = getVScaleRange(I.getCaller(), 64);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned EltWidth = TLI.getBitWidthForCttzElements(
+      I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
+      &VScaleRange);
+  MVT StepVT = MVT::getIntegerVT(EltWidth);
+  EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+  // Zero out lanes with inactive elements, then find the highest remaining
+  // value from the stepvector.
+  SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
+  SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
+  SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
+  SDValue HighestIdx =
+      DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
+
+  // Extract the corresponding lane from the data vector
+  EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+  SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
+  SDValue Extract =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
+
+  // If all mask lanes were inactive, choose the passthru value instead.
+  SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+  SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+  setValue(&I, Result);
+}
+
 /// Lower the call to the specified intrinsic function.
 void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -8237,42 +8281,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   }
   case Intrinsic::experimental_vector_extract_last_active: {
-    SDValue Data = getValue(I.getOperand(0));
-    SDValue Mask = getValue(I.getOperand(1));
-    SDValue PassThru = getValue(I.getOperand(2));
-
-    EVT DataVT = Data.getValueType();
-    EVT ScalarVT = PassThru.getValueType();
-    EVT BoolVT = Mask.getValueType().getScalarType();
-
-    // Find a suitable type for a stepvector.
-    ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
-    if (DataVT.isScalableVector())
-      VScaleRange = getVScaleRange(I.getCaller(), 64);
-    unsigned EltWidth = TLI.getBitWidthForCttzElements(
-        I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
-        &VScaleRange);
-    MVT StepVT = MVT::getIntegerVT(EltWidth);
-    EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
-
-    // Zero out lanes with inactive elements, then find the highest remaining
-    // value from the stepvector.
-    SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
-    SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
-    SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
-    SDValue HighestIdx =
-        DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
-
-    // Extract the corresponding lane from the data vector
-    EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-    SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
-    SDValue Extract =
-        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
-
-    // If all mask lanes were inactive, choose the passthru value instead.
-    SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
-    SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
-    setValue(&I, Result);
+    visitVectorExtractLastActive(I, Intrinsic);
     return;
   }
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 3f8a3e7ffb65b..3a8dc25e98700 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -629,6 +629,7 @@ class SelectionDAGBuilder {
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
   void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
   void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
+  void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic);
   void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
                    const SmallVectorImpl<SDValue> &OpValues);
   void visitVPStore(const VPIntrinsic &VPIntrin,
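
For reference, the node sequence built in visitVectorExtractLastActive corresponds one-for-one to a plain IR expansion of the intrinsic: the "umax + or reduction + select" named in the first commit message. The sketch below is illustrative only, not code from this patch: the function name is hypothetical, and the i8 step type is just one plausible choice for a four-element vector (the real code sizes the step type via getBitWidthForCttzElements). Shown here for the fixed-width v4i32 case:

declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

define i32 @extract_last_active_expansion(<4 x i32> %data, <4 x i1> %mask, i32 %passthru) {
  ; Keep each lane's step index where the mask is active, zero elsewhere.
  %active = select <4 x i1> %mask, <4 x i8> <i8 0, i8 1, i8 2, i8 3>, <4 x i8> zeroinitializer
  ; The unsigned-max reduction yields the index of the last active lane
  ; (0 when no lane is active).
  %last = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %active)
  %idx = zext i8 %last to i32
  %elt = extractelement <4 x i32> %data, i32 %idx
  ; An or-reduction over the mask detects the all-inactive case, in which
  ; the passthru value is returned instead of lane 0's element.
  %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
  %res = select i1 %any, i32 %elt, i32 %passthru
  ret i32 %res
}

For scalable vectors the constant step vector becomes a stepvector node and the step element width is derived from the function's vscale_range attribute, as in the C++ code above.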