diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1f98d69edb473..bfbcc14baf18c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13402,30 +13402,6 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return true; } -/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in -/// the first vector operand. -static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) { - assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size"); - unsigned Lane = (unsigned)M[0]; - unsigned Segments = VT.getFixedSizeInBits() / 128; - unsigned SegmentElts = VT.getVectorNumElements() / Segments; - - // Make sure there's no size changes. - if (SegmentElts * Segments != M.size()) - return std::nullopt; - - // Check the first index corresponds to one of the lanes in the first segment. - if (Lane >= SegmentElts) - return std::nullopt; - - // Check that all lanes match the first, adjusted for segment. - for (unsigned I = 0; I < M.size(); ++I) - if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts))) - return std::nullopt; - - return Lane; -} - /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
@@ -30026,8 +30002,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } - if (Subtarget->hasSVE2p1()) { - if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) { + if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) && + Subtarget->isSVEorStreamingSVEAvailable()) { + assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && + "Unsupported SVE vector size"); + + unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VT.getVectorNumElements() / Segments; + if (std::optional<unsigned> Lane = + isDUPQMask(ShuffleMask, Segments, SegmentElts)) { SDValue IID = DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); return convertFromScalableVector( diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index 7b044cf7c238f..e9bc6d947b0d9 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -6723,6 +6724,32 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts, return true; } +/// isDUPQMask - matches a splat of equivalent lanes within segments of a given +/// number of elements. +inline std::optional<unsigned> isDUPQMask(ArrayRef<int> Mask, unsigned Segments, + unsigned SegmentSize) { + unsigned Lane = unsigned(Mask[0]); + + // Make sure there's no size changes. + if (SegmentSize * Segments != Mask.size()) + return std::nullopt; + + // Check the first index corresponds to one of the lanes in the first segment. + if (Lane >= SegmentSize) + return std::nullopt; + + // Check that all lanes match the first, adjusted for segment. + // Undef/poison lanes (<0) are also accepted. 
+ if (all_of(enumerate(Mask), [&](auto P) { + const unsigned SegmentIndex = P.index() / SegmentSize; + return P.value() < 0 || + unsigned(P.value()) == Lane + SegmentIndex * SegmentSize; + })) + return Lane; + + return std::nullopt; +} + } // namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 15e38e6cb2408..3387dee8aa4c8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5599,6 +5599,23 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, SrcTy = DstTy; } + // Segmented shuffle matching. + if ((ST->hasSVE2p1() || ST->hasSME2p1()) && + ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc && + isa<FixedVectorType>(SrcTy) && !Mask.empty() && + SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( + AArch64::SVEBitsPerBlock)) { + + FixedVectorType *VTy = cast<FixedVectorType>(SrcTy); + unsigned Segments = + VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VTy->getNumElements() / Segments; + + // dupq zd.t, zn.t[idx] + if (isDUPQMask(Mask, Segments, SegmentElts)) + return LT.first; + } + // Check for broadcast loads, which are supported by the LD1R instruction. // In terms of code-size, the shuffle vector is free when a load + dup get // folded into a LD1R. That's what we check and return here. 
For performance diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll new file mode 100644 index 0000000000000..790f49f1d3b82 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s + +;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) +define void @dup_within_each_segment_256b() #0 { +; CHECK-LABEL: 'dup_within_each_segment_256b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 
x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + +define void @dup_within_each_segment_512b() #1 { +; CHECK-LABEL: 'dup_within_each_segment_512b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + +attributes #0 = { noinline vscale_range(2,2) } +attributes #1 = { noinline vscale_range(4,4) } diff 
--git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll index 40d4d0ff60148..da83b27ce4d55 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1,+bf16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME define void @dupq_i8_256b(ptr %addr) #0 { ; CHECK-LABEL: dupq_i8_256b: @@ -71,13 +72,43 @@ define void @dupq_f16_256b(ptr %addr) #0 { } define void @dupq_bf16_256b(ptr %addr) #0 { -; CHECK-LABEL: dupq_bf16_256b: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: dup v0.8h, v0.h[2] -; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: dupq_bf16_256b: +; SVE: // %bb.0: +; SVE-NEXT: ldp q0, q1, [x0] +; SVE-NEXT: dup v0.8h, v0.h[2] +; SVE-NEXT: dup v1.8h, v1.h[2] +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret +; +; SME-LABEL: dupq_bf16_256b: +; SME: // %bb.0: +; SME-NEXT: ldp q1, q0, [x0] +; SME-NEXT: str q0, [sp, #-64]! 
+; SME-NEXT: .cfi_def_cfa_offset 64 +; SME-NEXT: ldr h0, [sp, #4] +; SME-NEXT: str q1, [sp, #32] +; SME-NEXT: str h0, [sp, #30] +; SME-NEXT: str h0, [sp, #28] +; SME-NEXT: str h0, [sp, #26] +; SME-NEXT: str h0, [sp, #24] +; SME-NEXT: str h0, [sp, #22] +; SME-NEXT: str h0, [sp, #20] +; SME-NEXT: str h0, [sp, #18] +; SME-NEXT: str h0, [sp, #16] +; SME-NEXT: ldr h0, [sp, #36] +; SME-NEXT: ldr q1, [sp, #16] +; SME-NEXT: str h0, [sp, #62] +; SME-NEXT: str h0, [sp, #60] +; SME-NEXT: str h0, [sp, #58] +; SME-NEXT: str h0, [sp, #56] +; SME-NEXT: str h0, [sp, #54] +; SME-NEXT: str h0, [sp, #52] +; SME-NEXT: str h0, [sp, #50] +; SME-NEXT: str h0, [sp, #48] +; SME-NEXT: ldr q0, [sp, #48] +; SME-NEXT: stp q0, q1, [x0] +; SME-NEXT: add sp, sp, #64 +; SME-NEXT: ret %load = load <16 x bfloat>, ptr %addr %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> @@ -112,4 +143,18 @@ define void @dupq_f64_256b(ptr %addr) #0 { ret void } -attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" } +define void @dupq_f32_256b_with_poison(ptr %addr) #0 { +; CHECK-LABEL: dupq_f32_256b_with_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.s, z0.s[3] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <8 x float>, ptr %addr + %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> + store <8 x float> %splat.lanes, ptr %addr + ret void +} + +attributes #0 = { noinline vscale_range(2,2) }