From a42ffba3f261fd2e57cf744097875b4b27aa877b Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 19 Jun 2025 15:29:48 +0000 Subject: [PATCH 01/11] Test precommit --- .../segmented-shufflevector-patterns.ll | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll new file mode 100644 index 0000000000000..466b2990a548b --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s + +;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) +define void @dup_within_each_segment() #0 { +; CHECK-LABEL: 'dup_within_each_segment' +; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + ret void +} + +attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1" } From e160c70b4504122e47581b6834f530bfbe599e45 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 19 Jun 2025 15:31:23 +0000 Subject: [PATCH 02/11] Return lower cost for dupq --- .../AArch64/AArch64TargetTransformInfo.cpp | 20 +++++++++++++++++++ .../segmented-shufflevector-patterns.ll | 10 +++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 15e38e6cb2408..7522f5773fc58 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5599,6 +5599,26 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, SrcTy = DstTy; } + // Segmented shuffle matching. + if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput && + Kind == TTI::SK_PermuteSingleSrc && isa(Tp) && + Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) { + + FixedVectorType *VTy = cast(Tp); + unsigned Segments = VTy->getPrimitiveSizeInBits() / 128; + unsigned SegmentElts = VTy->getNumElements() / Segments; + + // dupq zd.t, zn.t[idx] + unsigned Lane = (unsigned)Mask[0]; + if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) { + bool IsDupQ = true; + for (unsigned I = 1; I < Mask.size(); ++I) + IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts); + if (IsDupQ) + return LT.first; + } + } + // Check for broadcast loads, which are supported by the LD1R instruction. // In terms of code-size, the shuffle vector is free when a load + dup get // folded into a LD1R. That's what we check and return here. For performance diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll index 466b2990a548b..e6a57d1687254 100644 --- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -4,11 +4,11 @@ ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) define void @dup_within_each_segment() #0 { ; CHECK-LABEL: 'dup_within_each_segment' -; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> Date: Fri, 20 Jun 2025 15:27:46 +0000 Subject: [PATCH 03/11] * Refactor to share isDUPQMask * Support SME2p1 * Remove hardcoded magic number * Return the same result for other cost kinds --- .../Target/AArch64/AArch64ISelLowering.cpp | 34 +++++-------------- .../Target/AArch64/AArch64PerfectShuffle.h | 24 +++++++++++++ .../AArch64/AArch64TargetTransformInfo.cpp | 18 ++++------ 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1f98d69edb473..0387721087ce3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13402,30 +13402,6 @@ static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { return true; } -/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in -/// the first vector operand. -static std::optional isDUPQMask(ArrayRef M, EVT VT) { - assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size"); - unsigned Lane = (unsigned)M[0]; - unsigned Segments = VT.getFixedSizeInBits() / 128; - unsigned SegmentElts = VT.getVectorNumElements() / Segments; - - // Make sure there's no size changes. - if (SegmentElts * Segments != M.size()) - return std::nullopt; - - // Check the first index corresponds to one of the lanes in the first segment. - if (Lane >= SegmentElts) - return std::nullopt; - - // Check that all lanes match the first, adjusted for segment. - for (unsigned I = 0; I < M.size(); ++I) - if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts))) - return std::nullopt; - - return Lane; -} - /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. @@ -30026,8 +30002,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } - if (Subtarget->hasSVE2p1()) { - if (std::optional Lane = isDUPQMask(ShuffleMask, VT)) { + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) { + assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && + "Unsupported SVE vector size"); + + unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VT.getVectorNumElements() / Segments; + if (std::optional Lane = + isDUPQMask(ShuffleMask, Segments, SegmentElts)) { SDValue IID = DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); return convertFromScalableVector( diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index 7b044cf7c238f..01d8fbc705a5f 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -6723,6 +6724,29 @@ inline bool isREVMask(ArrayRef M, unsigned EltSize, unsigned NumElts, return true; } +/// isDUPQMask - matches a splat of equivalent lanes within segments of a given +/// number of elements. +inline std::optional isDUPQMask(ArrayRef M, unsigned Segments, + unsigned NumElts) { + unsigned Lane = (unsigned)M[0]; + + // Make sure there's no size changes. + if (NumElts * Segments != M.size()) + return std::nullopt; + + // Check the first index corresponds to one of the lanes in the first segment. + if (Lane >= NumElts) + return std::nullopt; + + // Check that all lanes match the first, adjusted for segment. + if (all_of(enumerate(M), [&](auto P) { + return (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts; + })) + return Lane; + + return std::nullopt; +} + } // namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7522f5773fc58..cf37069337723 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5600,23 +5600,19 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } // Segmented shuffle matching. - if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput && + if ((ST->hasSVE2p1() || ST->hasSME2p1()) && Kind == TTI::SK_PermuteSingleSrc && isa(Tp) && - Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) { + Tp->getPrimitiveSizeInBits().isKnownMultipleOf( + AArch64::SVEBitsPerBlock)) { FixedVectorType *VTy = cast(Tp); - unsigned Segments = VTy->getPrimitiveSizeInBits() / 128; + unsigned Segments = + VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; unsigned SegmentElts = VTy->getNumElements() / Segments; // dupq zd.t, zn.t[idx] - unsigned Lane = (unsigned)Mask[0]; - if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) { - bool IsDupQ = true; - for (unsigned I = 1; I < Mask.size(); ++I) - IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts); - if (IsDupQ) - return LT.first; - } + if (isDUPQMask(Mask, Segments, SegmentElts)) + return LT.first; } // Check for broadcast loads, which are supported by the LD1R instruction. From e72d339cbbde782aed30cb02190f1a84642d0f1c Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Mon, 23 Jun 2025 09:49:39 +0000 Subject: [PATCH 04/11] Improve SME check, add runline to test for it --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 ++- .../CostModel/AArch64/segmented-shufflevector-patterns.ll | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index cf37069337723..110f3cdbe01b0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5600,7 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } // Segmented shuffle matching. - if ((ST->hasSVE2p1() || ST->hasSME2p1()) && + if ((ST->hasSVE2p1() || + (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && Kind == TTI::SK_PermuteSingleSrc && isa(Tp) && Tp->getPrimitiveSizeInBits().isKnownMultipleOf( AArch64::SVEBitsPerBlock)) { diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll index e6a57d1687254..6fb11e1bdac17 100644 --- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) define void @dup_within_each_segment() #0 { @@ -22,4 +23,4 @@ define void @dup_within_each_segment() #0 { ret void } -attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1" } +attributes #0 = { noinline vscale_range(2,2) } From a03c040d70bf0480d960c8cfe4f9d80589544e3e Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Mon, 23 Jun 2025 10:15:12 +0000 Subject: [PATCH 05/11] Update ISel and codegen test too --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 +- .../CodeGen/AArch64/sve2p1-vector-shuffles.ll | 49 +++++++++++++++---- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0387721087ce3..3f39e982c4c16 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -30002,7 +30002,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } - if (Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) { + if (Subtarget->hasSVE2p1() || + (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) { assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && "Unsupported SVE vector size"); diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll index 40d4d0ff60148..3fe087044332e 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1,+bf16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME define void @dupq_i8_256b(ptr %addr) #0 { ; CHECK-LABEL: dupq_i8_256b: @@ -71,13 +72,43 @@ define void @dupq_f16_256b(ptr %addr) #0 { } define void @dupq_bf16_256b(ptr %addr) #0 { -; CHECK-LABEL: dupq_bf16_256b: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: dup v0.8h, v0.h[2] -; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: dupq_bf16_256b: +; SVE: // %bb.0: +; SVE-NEXT: ldp q0, q1, [x0] +; SVE-NEXT: dup v0.8h, v0.h[2] +; SVE-NEXT: dup v1.8h, v1.h[2] +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret +; +; SME-LABEL: dupq_bf16_256b: +; SME: // %bb.0: +; SME-NEXT: ldp q1, q0, [x0] +; SME-NEXT: str q0, [sp, #-64]! +; SME-NEXT: .cfi_def_cfa_offset 64 +; SME-NEXT: ldr h0, [sp, #4] +; SME-NEXT: str q1, [sp, #32] +; SME-NEXT: str h0, [sp, #30] +; SME-NEXT: str h0, [sp, #28] +; SME-NEXT: str h0, [sp, #26] +; SME-NEXT: str h0, [sp, #24] +; SME-NEXT: str h0, [sp, #22] +; SME-NEXT: str h0, [sp, #20] +; SME-NEXT: str h0, [sp, #18] +; SME-NEXT: str h0, [sp, #16] +; SME-NEXT: ldr h0, [sp, #36] +; SME-NEXT: ldr q1, [sp, #16] +; SME-NEXT: str h0, [sp, #62] +; SME-NEXT: str h0, [sp, #60] +; SME-NEXT: str h0, [sp, #58] +; SME-NEXT: str h0, [sp, #56] +; SME-NEXT: str h0, [sp, #54] +; SME-NEXT: str h0, [sp, #52] +; SME-NEXT: str h0, [sp, #50] +; SME-NEXT: str h0, [sp, #48] +; SME-NEXT: ldr q0, [sp, #48] +; SME-NEXT: stp q0, q1, [x0] +; SME-NEXT: add sp, sp, #64 +; SME-NEXT: ret %load = load <16 x bfloat>, ptr %addr %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> @@ -112,4 +143,4 @@ define void @dupq_f64_256b(ptr %addr) #0 { ret void } -attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" } +attributes #0 = { noinline vscale_range(2,2) } From 6899d1cc7e3a204b79e7371d3928a3eb0e1b40e5 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Mon, 23 Jun 2025 12:31:50 +0000 Subject: [PATCH 06/11] Rebase, getShuffleCost params changed --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 110f3cdbe01b0..b0b76faa803ae 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5602,11 +5602,11 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, // Segmented shuffle matching. if ((ST->hasSVE2p1() || (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && - Kind == TTI::SK_PermuteSingleSrc && isa(Tp) && - Tp->getPrimitiveSizeInBits().isKnownMultipleOf( + Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && + SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( AArch64::SVEBitsPerBlock)) { - FixedVectorType *VTy = cast(Tp); + FixedVectorType *VTy = cast(SrcTy); unsigned Segments = VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; unsigned SegmentElts = VTy->getNumElements() / Segments; From 57aaebe4d673e134156f9940c066c88968366fbc Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Mon, 23 Jun 2025 12:50:04 +0000 Subject: [PATCH 07/11] Only check for isStreaming() --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3f39e982c4c16..e1c2e1144b51c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -30003,7 +30003,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( } if (Subtarget->hasSVE2p1() || - (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) { + (Subtarget->hasSME2p1() && Subtarget->isStreaming())) { assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && "Unsupported SVE vector size"); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b0b76faa803ae..a010c8c378a11 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5600,8 +5600,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } // Segmented shuffle matching. - if ((ST->hasSVE2p1() || - (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && + if ((ST->hasSVE2p1() || (ST->hasSME2p1() && ST->isStreaming())) && Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( AArch64::SVEBitsPerBlock)) { From 3a7c14ca2331d4b0eb64ab1ba682e65c0d9cb219 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Mon, 23 Jun 2025 13:20:00 +0000 Subject: [PATCH 08/11] Revert "Only check for isStreaming()" This reverts commit 57aaebe4d673e134156f9940c066c88968366fbc. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e1c2e1144b51c..3f39e982c4c16 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -30003,7 +30003,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( } if (Subtarget->hasSVE2p1() || - (Subtarget->hasSME2p1() && Subtarget->isStreaming())) { + (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) { assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && "Unsupported SVE vector size"); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a010c8c378a11..b0b76faa803ae 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5600,7 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } // Segmented shuffle matching. - if ((ST->hasSVE2p1() || (ST->hasSME2p1() && ST->isStreaming())) && + if ((ST->hasSVE2p1() || + (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( AArch64::SVEBitsPerBlock)) { From f45e0b21f2588b134e4d2bd3bf2a6333b3b2d456 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 24 Jun 2025 09:20:14 +0000 Subject: [PATCH 09/11] * Check for mask being empty * Handle poison lanes --- llvm/lib/Target/AArch64/AArch64PerfectShuffle.h | 4 +++- .../Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +- .../AArch64/segmented-shufflevector-patterns.ll | 3 +++ .../test/CodeGen/AArch64/sve2p1-vector-shuffles.ll | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index 01d8fbc705a5f..cd79251610fe1 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6739,8 +6739,10 @@ inline std::optional isDUPQMask(ArrayRef M, unsigned Segments, return std::nullopt; // Check that all lanes match the first, adjusted for segment. + // Undef/poison lanes (<0) are also accepted. if (all_of(enumerate(M), [&](auto P) { - return (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts; + return P.value() < 0 || + (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts; })) return Lane; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b0b76faa803ae..e00519f0faf5c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5604,7 +5604,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( - AArch64::SVEBitsPerBlock)) { + AArch64::SVEBitsPerBlock) && !Mask.empty()) { FixedVectorType *VTy = cast(SrcTy); unsigned Segments = diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll index 6fb11e1bdac17..72dca2d9ab1da 100644 --- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -10,6 +10,7 @@ define void @dup_within_each_segment() #0 { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> ret void } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll index 3fe087044332e..da83b27ce4d55 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll @@ -143,4 +143,18 @@ define void @dupq_f64_256b(ptr %addr) #0 { ret void } +define void @dupq_f32_256b_with_poison(ptr %addr) #0 { +; CHECK-LABEL: dupq_f32_256b_with_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.s, z0.s[3] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <8 x float>, ptr %addr + %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> + store <8 x float> %splat.lanes, ptr %addr + ret void +} + attributes #0 = { noinline vscale_range(2,2) } From 0c1cdff8a7813d74df3b32e65c7650486acb81fa Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 24 Jun 2025 10:09:44 +0000 Subject: [PATCH 10/11] Make isDUPQMask clearer, add 512b function to cost test --- .../Target/AArch64/AArch64PerfectShuffle.h | 15 +++++----- .../AArch64/AArch64TargetTransformInfo.cpp | 3 +- .../segmented-shufflevector-patterns.ll | 28 +++++++++++++++++-- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index cd79251610fe1..e9bc6d947b0d9 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6726,23 +6726,24 @@ inline bool isREVMask(ArrayRef M, unsigned EltSize, unsigned NumElts, /// isDUPQMask - matches a splat of equivalent lanes within segments of a given /// number of elements. -inline std::optional isDUPQMask(ArrayRef M, unsigned Segments, - unsigned NumElts) { - unsigned Lane = (unsigned)M[0]; +inline std::optional isDUPQMask(ArrayRef Mask, unsigned Segments, + unsigned SegmentSize) { + unsigned Lane = unsigned(Mask[0]); // Make sure there's no size changes. - if (NumElts * Segments != M.size()) + if (SegmentSize * Segments != Mask.size()) return std::nullopt; // Check the first index corresponds to one of the lanes in the first segment. - if (Lane >= NumElts) + if (Lane >= SegmentSize) return std::nullopt; // Check that all lanes match the first, adjusted for segment. // Undef/poison lanes (<0) are also accepted. - if (all_of(enumerate(M), [&](auto P) { + if (all_of(enumerate(Mask), [&](auto P) { + const unsigned SegmentIndex = P.index() / SegmentSize; return P.value() < 0 || - (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts; + unsigned(P.value()) == Lane + SegmentIndex * SegmentSize; })) return Lane; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e00519f0faf5c..1ec223068722c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5604,7 +5604,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( - AArch64::SVEBitsPerBlock) && !Mask.empty()) { + AArch64::SVEBitsPerBlock) && + !Mask.empty()) { FixedVectorType *VTy = cast(SrcTy); unsigned Segments = diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll index 72dca2d9ab1da..790f49f1d3b82 100644 --- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -3,8 +3,8 @@ ; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) -define void @dup_within_each_segment() #0 { -; CHECK-LABEL: 'dup_within_each_segment' +define void @dup_within_each_segment_256b() #0 { +; CHECK-LABEL: 'dup_within_each_segment_256b' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> @@ -26,4 +26,28 @@ define void @dup_within_each_segment() #0 { ret void } +define void @dup_within_each_segment_512b() #1 { +; CHECK-LABEL: 'dup_within_each_segment_512b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + attributes #0 = { noinline vscale_range(2,2) } +attributes #1 = { noinline vscale_range(4,4) } From 80b073c7992d23d135c9ab733f67062c48b75505 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 24 Jun 2025 11:14:47 +0000 Subject: [PATCH 11/11] Correction to feature checking --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3f39e982c4c16..bfbcc14baf18c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -30002,8 +30002,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } - if (Subtarget->hasSVE2p1() || - (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) { + if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) && + Subtarget->isSVEorStreamingSVEAvailable()) { assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && "Unsupported SVE vector size"); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 1ec223068722c..3387dee8aa4c8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5600,12 +5600,11 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } // Segmented shuffle matching. - if ((ST->hasSVE2p1() || - (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) && - Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && + if ((ST->hasSVE2p1() || ST->hasSME2p1()) && + ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc && + isa(SrcTy) && !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( - AArch64::SVEBitsPerBlock) && - !Mask.empty()) { + AArch64::SVEBitsPerBlock)) { FixedVectorType *VTy = cast(SrcTy); unsigned Segments =