diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1f98d69edb473..bfbcc14baf18c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13402,30 +13402,6 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return true; } -/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in -/// the first vector operand. -static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) { - assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size"); - unsigned Lane = (unsigned)M[0]; - unsigned Segments = VT.getFixedSizeInBits() / 128; - unsigned SegmentElts = VT.getVectorNumElements() / Segments; - - // Make sure there's no size changes. - if (SegmentElts * Segments != M.size()) - return std::nullopt; - - // Check the first index corresponds to one of the lanes in the first segment. - if (Lane >= SegmentElts) - return std::nullopt; - - // Check that all lanes match the first, adjusted for segment. - for (unsigned I = 0; I < M.size(); ++I) - if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts))) - return std::nullopt; - - return Lane; -} - /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
@@ -30026,8 +30002,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } - if (Subtarget->hasSVE2p1()) { - if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) { + if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) && + Subtarget->isSVEorStreamingSVEAvailable()) { + assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && + "Unsupported SVE vector size"); + + unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VT.getVectorNumElements() / Segments; + if (std::optional<unsigned> Lane = + isDUPQMask(ShuffleMask, Segments, SegmentElts)) { SDValue IID = DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); return convertFromScalableVector( diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index 7b044cf7c238f..e9bc6d947b0d9 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -6723,6 +6724,32 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts, return true; } +/// isDUPQMask - matches a splat of equivalent lanes within segments of a given +/// number of elements. +inline std::optional<unsigned> isDUPQMask(ArrayRef<int> Mask, unsigned Segments, + unsigned SegmentSize) { + unsigned Lane = unsigned(Mask[0]); + + // Make sure there's no size changes. + if (SegmentSize * Segments != Mask.size()) + return std::nullopt; + + // Check the first index corresponds to one of the lanes in the first segment. + if (Lane >= SegmentSize) + return std::nullopt; + + // Check that all lanes match the first, adjusted for segment. + // Undef/poison lanes (<0) are also accepted. 
+ if (all_of(enumerate(Mask), [&](auto P) { + const unsigned SegmentIndex = P.index() / SegmentSize; + return P.value() < 0 || + unsigned(P.value()) == Lane + SegmentIndex * SegmentSize; + })) + return Lane; + + return std::nullopt; +} + } // namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 15e38e6cb2408..3387dee8aa4c8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5599,6 +5599,23 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, SrcTy = DstTy; } + // Segmented shuffle matching. + if ((ST->hasSVE2p1() || ST->hasSME2p1()) && + ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc && + isa<FixedVectorType>(SrcTy) && !Mask.empty() && + SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( + AArch64::SVEBitsPerBlock)) { + + FixedVectorType *VTy = cast<FixedVectorType>(SrcTy); + unsigned Segments = + VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VTy->getNumElements() / Segments; + + // dupq zd.t, zn.t[idx] + if (isDUPQMask(Mask, Segments, SegmentElts)) + return LT.first; + } + // Check for broadcast loads, which are supported by the LD1R instruction. // In terms of code-size, the shuffle vector is free when a load + dup get // folded into a LD1R. That's what we check and return here. 
For performance diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll new file mode 100644 index 0000000000000..790f49f1d3b82 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s + +;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) +define void @dup_within_each_segment_256b() #0 { +; CHECK-LABEL: 'dup_within_each_segment_256b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 
x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + +define void @dup_within_each_segment_512b() #1 { +; CHECK-LABEL: 'dup_within_each_segment_512b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + +attributes #0 = { noinline vscale_range(2,2) } +attributes #1 = { noinline vscale_range(4,4) } diff 
--git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll index 40d4d0ff60148..da83b27ce4d55 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1,+bf16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME define void @dupq_i8_256b(ptr %addr) #0 { ; CHECK-LABEL: dupq_i8_256b: @@ -71,13 +72,43 @@ define void @dupq_f16_256b(ptr %addr) #0 { } define void @dupq_bf16_256b(ptr %addr) #0 { -; CHECK-LABEL: dupq_bf16_256b: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: dup v0.8h, v0.h[2] -; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: dupq_bf16_256b: +; SVE: // %bb.0: +; SVE-NEXT: ldp q0, q1, [x0] +; SVE-NEXT: dup v0.8h, v0.h[2] +; SVE-NEXT: dup v1.8h, v1.h[2] +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret +; +; SME-LABEL: dupq_bf16_256b: +; SME: // %bb.0: +; SME-NEXT: ldp q1, q0, [x0] +; SME-NEXT: str q0, [sp, #-64]! 
+; SME-NEXT: .cfi_def_cfa_offset 64 +; SME-NEXT: ldr h0, [sp, #4] +; SME-NEXT: str q1, [sp, #32] +; SME-NEXT: str h0, [sp, #30] +; SME-NEXT: str h0, [sp, #28] +; SME-NEXT: str h0, [sp, #26] +; SME-NEXT: str h0, [sp, #24] +; SME-NEXT: str h0, [sp, #22] +; SME-NEXT: str h0, [sp, #20] +; SME-NEXT: str h0, [sp, #18] +; SME-NEXT: str h0, [sp, #16] +; SME-NEXT: ldr h0, [sp, #36] +; SME-NEXT: ldr q1, [sp, #16] +; SME-NEXT: str h0, [sp, #62] +; SME-NEXT: str h0, [sp, #60] +; SME-NEXT: str h0, [sp, #58] +; SME-NEXT: str h0, [sp, #56] +; SME-NEXT: str h0, [sp, #54] +; SME-NEXT: str h0, [sp, #52] +; SME-NEXT: str h0, [sp, #50] +; SME-NEXT: str h0, [sp, #48] +; SME-NEXT: ldr q0, [sp, #48] +; SME-NEXT: stp q0, q1, [x0] +; SME-NEXT: add sp, sp, #64 +; SME-NEXT: ret %load = load <16 x bfloat>, ptr %addr %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> @@ -112,4 +143,18 @@ define void @dupq_f64_256b(ptr %addr) #0 { ret void } -attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" } +define void @dupq_f32_256b_with_poison(ptr %addr) #0 { +; CHECK-LABEL: dupq_f32_256b_with_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.s, z0.s[3] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <8 x float>, ptr %addr + %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> + store <8 x float> %splat.lanes, ptr %addr + ret void +} + +attributes #0 = { noinline vscale_range(2,2) }