From e899c47c8748bc516f8b0964847f2f1438f5f120 Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 8 Oct 2024 14:53:36 +0100
Subject: [PATCH 01/14] Add support for single reductions in
 ComplexDeinterleavingPass

---
 .../llvm/CodeGen/ComplexDeinterleavingPass.h  |   1 +
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 121 +++++++++++--
 .../Target/AArch64/AArch64ISelLowering.cpp    |  19 +-
 .../AArch64/complex-deinterleaving-cdot.ll    | 170 ++++++++++++++++++
 4 files changed, 288 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll

diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 84a2673fecb5b..a3fa219772770 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -43,6 +43,7 @@ enum class ComplexDeinterleavingOperation {
   ReductionPHI,
   ReductionOperation,
   ReductionSelect,
+  ReductionSingle
 };
 
 enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8573b016d1e5b..08287a4d5ed02 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -145,6 +145,7 @@ struct ComplexDeinterleavingCompositeNode {
   friend class ComplexDeinterleavingGraph;
   using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
   using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+  bool OperandsValid = true;
 
 public:
   ComplexDeinterleavingOperation Operation;
@@ -161,7 +162,11 @@ struct ComplexDeinterleavingCompositeNode {
   SmallVector<RawNodePtr> Operands;
   Value *ReplacementNode = nullptr;
 
-  void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+  void addOperand(NodePtr Node) {
+    if (!Node || !Node.get())
+      OperandsValid = false;
+    Operands.push_back(Node.get());
+  }
 
   void dump() { dump(dbgs()); }
   void dump(raw_ostream &OS) {
@@ -195,6 +200,10 @@ struct ComplexDeinterleavingCompositeNode {
       PrintNodeRef(Op);
     }
   }
+
+  bool AreOperandsValid() {
+    return OperandsValid;
+  }
 };
 
 class ComplexDeinterleavingGraph {
@@ -294,7 +303,7 @@ class ComplexDeinterleavingGraph {
 
   NodePtr submitCompositeNode(NodePtr Node) {
     CompositeNodes.push_back(Node);
-    if (Node->Real && Node->Imag)
+    if (Node->Real)
       CachedResult[{Node->Real, Node->Imag}] = Node;
     return Node;
   }
@@ -328,8 +337,10 @@ class ComplexDeinterleavingGraph {
   /// i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
   NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+  NodePtr identifyPartialReduction(Value *R, Value *I);
 
   NodePtr identifyNode(Value *R, Value *I);
+  NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
 
   /// Determine if a sum of complex numbers can be formed from \p RealAddends
   /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
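A note on the two ComplexDeinterleavingPass.cpp hunks above: a null operand no
longer has to be caught at every call site; addOperand records that one was
seen, and checkNodes (extended later in this patch) rejects the whole graph
before any IR is rewritten. A condensed sketch of that contract, with
unrelated members elided (illustrative only, not a verbatim excerpt):

    struct ComplexDeinterleavingCompositeNode {
      bool OperandsValid = true;
      SmallVector<RawNodePtr> Operands;

      void addOperand(NodePtr Node) {
        if (!Node)                // identification failed somewhere below
          OperandsValid = false;  // defer the failure instead of asserting
        Operands.push_back(Node.get());
      }
      bool AreOperandsValid() { return OperandsValid; }
    };

    bool ComplexDeinterleavingGraph::checkNodes() {
      // Reject the graph wholesale if any composite node saw a null operand.
      for (NodePtr N : CompositeNodes)
        if (!N->AreOperandsValid())
          return false;
      // ... the existing instruction-coverage checks follow ...
      return true;
    }
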
@@ -397,6 +408,7 @@ class ComplexDeinterleavingGraph { /// * Deinterleave the final value outside of the loop and repurpose original /// reduction users void processReductionOperation(Value *OperationReplacement, RawNodePtr Node); + void processReductionSingle(Value *OperationReplacement, RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -893,16 +905,26 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { - LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); - assert(R->getType() == I->getType() && - "Real and imaginary parts should not have different types"); + bool _; + return identifyNode(R, I, _); +} +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { auto It = CachedResult.find({R, I}); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); + FromCache = true; return It->second; } + if(NodePtr CN = identifyPartialReduction(R, I)) + return CN; + + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if(!IsReduction && R->getType() != I->getType()) + return nullptr; + if (NodePtr CN = identifySplat(R, I)) return CN; @@ -1428,12 +1450,18 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation); + ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); - auto *I = cast(RootNode->Imag); - auto *ReplacementAnchor = R->comesBefore(I) ? I : R; + auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; + + Instruction *ReplacementAnchor; + if(I) + ReplacementAnchor = R->comesBefore(I) ? I : R; + else + ReplacementAnchor = R; + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1521,11 +1549,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t i = 0; i < OperationInstruction.size(); ++i) { if (Processed[i]) continue; + auto *Real = OperationInstruction[i]; for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - - auto *Real = OperationInstruction[i]; + auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; @@ -1557,6 +1585,25 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { break; } } + + // We want to check that we have 2 operands, but the function attributes + // being counted as operands bloats this value. 
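+    // (Note: for a call such as the llvm.experimental.vector.partial.reduce.add
+    // intrinsic, getNumOperands() also counts the called-function operand, so
+    // this is a conservative guard for the getOperand(0)/getOperand(1) accesses
+    // below rather than an exact arity check.)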
+ if(Real->getNumOperands() < 2) + continue; + + RealPHI = ReductionInfo[Real].first; + ImagPHI = nullptr; + PHIsFound = false; + auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1)); + if(Node && PHIsFound) { + LLVM_DEBUG(dbgs() << "Identified single reduction starting from instruction: " + << *Real << "/" << *ReductionInfo[Real].second << "\n"); + Processed[i] = true; + auto RootNode = prepareCompositeNode(ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); + RootNode->addOperand(Node); + RootToNode[Real] = RootNode; + submitCompositeNode(RootNode); + } } RealPHI = nullptr; @@ -1564,6 +1611,12 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } bool ComplexDeinterleavingGraph::checkNodes() { + + for (NodePtr N : CompositeNodes) { + if (!N->AreOperandsValid()) + return false; + } + // Collect all instructions from roots to leaves SmallPtrSet AllInstructions; SmallVector Worklist; @@ -1832,7 +1885,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, Instruction *Imag) { - if (Real != RealPHI || Imag != ImagPHI) + if (Real != RealPHI || (ImagPHI && Imag != ImagPHI)) return nullptr; PHIsFound = true; @@ -1970,13 +2023,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. + auto *OldPHI = cast(Node->Real); auto *VTy = cast(Node->Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt()); - OldToNewPHI[dyn_cast(Node->Real)] = NewPHI; + OldToNewPHI[OldPHI] = NewPHI; ReplacementNode = NewPHI; break; } + case ComplexDeinterleavingOperation::ReductionSingle: + ReplacementNode = replaceNode(Builder, Node->Operands[0]); + processReductionSingle(ReplacementNode, Node); + break; case ComplexDeinterleavingOperation::ReductionOperation: ReplacementNode = replaceNode(Builder, Node->Operands[0]); processReductionOperation(ReplacementNode, Node); @@ -2001,6 +2059,37 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, return ReplacementNode; } +void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacement, RawNodePtr Node) { + auto *Real = cast(Node->Real); + auto *OldPHI = ReductionInfo[Real].first; + auto *NewPHI = OldToNewPHI[OldPHI]; + auto *VTy = cast(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + Value *Init = OldPHI->getIncomingValueForBlock(Incoming); + + IRBuilder<> Builder(Incoming->getTerminator()); + + Value *NewInit = nullptr; + if(auto *C = dyn_cast(Init)) { + if(C->isZeroValue()) + NewInit = Constant::getNullValue(NewVTy); + } + + if (!NewInit) + NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, + {Init, Constant::getNullValue(VTy)}); + + NewPHI->addIncoming(NewInit, Incoming); + NewPHI->addIncoming(OperationReplacement, BackEdge); + + auto *FinalReduction = ReductionInfo[Real].second; + Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt()); + // TODO Ensure that the `AddReduce` here matches the original, found in `FinalReduction` + auto *AddReduce = Builder.CreateAddReduce(OperationReplacement); + FinalReduction->replaceAllUsesWith(AddReduce); +} + void ComplexDeinterleavingGraph::processReductionOperation( Value 
*OperationReplacement, RawNodePtr Node) { auto *Real = cast(Node->Real); @@ -2060,8 +2149,12 @@ void ComplexDeinterleavingGraph::replaceNodes() { auto *RootImag = cast(RootNode->Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); - DeadInstrRoots.push_back(cast(RootReal)); - DeadInstrRoots.push_back(cast(RootImag)); + DeadInstrRoots.push_back(RootReal); + DeadInstrRoots.push_back(RootImag); + } else if(RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) { + auto *RootInst = cast(RootNode->Real); + ReductionInfo[RootInst].first->removeIncomingValue(BackEdge); + DeadInstrRoots.push_back(ReductionInfo[RootInst].second); } else { assert(R && "Unable to find replacement for RootInstruction"); DeadInstrRoots.push_back(RootInstruction); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7448416c682ab..7c3c32643ed64 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29217,6 +29217,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast(InputA->getType()); + if (Accumulator == nullptr) + Accumulator = Constant::getNullValue(Ty); bool IsScalable = Ty->isScalableTy(); bool IsInt = Ty->getElementType()->isIntegerTy(); @@ -29228,6 +29230,7 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; + int AccStride = cast(Accumulator->getType())->getElementCount().getKnownMinValue() / 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); @@ -29237,25 +29240,23 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride)); Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; - if (Accumulator) { - LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0)); + Type *FullTy = Ty; + FullTy = Accumulator->getType(); + auto *HalfAccTy = VectorType::getHalfElementsVectorType(cast(Accumulator->getType())); + LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); UpperSplitAcc = - B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); - } + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); auto *LowerSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, + auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), LowerSplitInt, B.getInt64(0)); - return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride)); + return B.CreateInsertVector(FullTy, Result, UpperSplitInt, B.getInt64(AccStride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { - if (Accumulator == nullptr) - Accumulator = Constant::getNullValue(Ty); - if (IsScalable) { if (IsInt) return B.CreateIntrinsic( diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll new file mode 100644 index 0000000000000..6277f9a3842bb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: define i32 @cdotp( +; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] +; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) +; CHECK-NEXT: [[VEC_PHI:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 +; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] +; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] +; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] +; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader +for.body.preheader: ; preds = %entry + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + %0 = call i64 @llvm.vscale.i64() + %1 = mul i64 %0, 16 + %min.iters.check = icmp ult i64 %wide.trip.count, %1 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph +vector.ph: ; preds = %for.body.preheader + %2 = call i64 @llvm.vscale.i64() + %3 = mul i64 %2, 16 + %n.mod.vf = urem i64 %wide.trip.count, %3 + %n.vec = sub i64 %wide.trip.count, %n.mod.vf + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 16 + br label %vector.body +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi [ zeroinitializer, 
%vector.ph ], [ %partial.reduce.sub, %vector.body ] + %index.i = shl nuw nsw i64 %index, 1 + %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i + %a.load = load , ptr %a.ptr + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i + %b.load = load , ptr %b.ptr + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul nsw %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul nsw %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + %index.next = add nuw i64 %index, %5 + %22 = icmp eq i64 %index.next, %n.vec + br i1 %22, label %middle.block, label %vector.body +middle.block: ; preds = %vector.body + %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + %cmp.n = icmp eq i64 %wide.trip.count, %n.vec + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph +scalar.ph: ; preds = %middle.block, %for.body.preheader + %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] + %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ] + br label %for.body +for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body + %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ] + br label %for.cond.cleanup +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %res.0.lcssa +for.body: ; preds = %scalar.ph, %for.body + %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] + %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ] + %26 = shl nuw nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26 + %27 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %27 to i32 + %28 = or disjoint i64 %26, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28 + %29 = load i8, ptr %arrayidx4, align 1 + %conv5 = sext i8 %29 to i32 + %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26 + %30 = load i8, ptr %arrayidx9, align 1 + %conv10 = sext i8 %30 to i32 + %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28 + %31 = load i8, ptr %arrayidx14, align 1 + %conv15 = sext i8 %31 to i32 + %mul16 = mul nsw i32 %conv10, %conv + %add17 = add nsw i32 %mul16, %res.030 + %mul18 = mul nsw i32 %conv15, %conv5 + %sub = sub i32 %add17, %mul18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} From fb22e229f48e6008a798aeff44d0ec56ab157e49 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Fri, 18 Oct 2024 11:14:49 +0100 Subject: [PATCH 02/14] Apply clang-format --- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 46 ++++++++++--------- .../Target/AArch64/AArch64ISelLowering.cpp | 24 ++++++---- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git 
a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 08287a4d5ed02..3a5436714715b 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -201,9 +201,7 @@ struct ComplexDeinterleavingCompositeNode { } } - bool AreOperandsValid() { - return OperandsValid; - } + bool AreOperandsValid() { return OperandsValid; } }; class ComplexDeinterleavingGraph { @@ -918,11 +916,11 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - if(NodePtr CN = identifyPartialReduction(R, I)) + if (NodePtr CN = identifyPartialReduction(R, I)) return CN; bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); - if(!IsReduction && R->getType() != I->getType()) + if (!IsReduction && R->getType() != I->getType()) return nullptr; if (NodePtr CN = identifySplat(R, I)) @@ -1450,18 +1448,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle); + ComplexDeinterleavingOperation::ReductionOperation || + RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; Instruction *ReplacementAnchor; - if(I) + if (I) ReplacementAnchor = R->comesBefore(I) ? I : R; - else + else ReplacementAnchor = R; - + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1553,7 +1553,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - + auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; @@ -1588,18 +1588,20 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { // We want to check that we have 2 operands, but the function attributes // being counted as operands bloats this value. 
- if(Real->getNumOperands() < 2) + if (Real->getNumOperands() < 2) continue; RealPHI = ReductionInfo[Real].first; ImagPHI = nullptr; PHIsFound = false; auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1)); - if(Node && PHIsFound) { - LLVM_DEBUG(dbgs() << "Identified single reduction starting from instruction: " - << *Real << "/" << *ReductionInfo[Real].second << "\n"); + if (Node && PHIsFound) { + LLVM_DEBUG( + dbgs() << "Identified single reduction starting from instruction: " + << *Real << "/" << *ReductionInfo[Real].second << "\n"); Processed[i] = true; - auto RootNode = prepareCompositeNode(ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); + auto RootNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); RootNode->addOperand(Node); RootToNode[Real] = RootNode; submitCompositeNode(RootNode); @@ -2059,7 +2061,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, return ReplacementNode; } -void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacement, RawNodePtr Node) { +void ComplexDeinterleavingGraph::processReductionSingle( + Value *OperationReplacement, RawNodePtr Node) { auto *Real = cast(Node->Real); auto *OldPHI = ReductionInfo[Real].first; auto *NewPHI = OldToNewPHI[OldPHI]; @@ -2071,21 +2074,21 @@ void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacem IRBuilder<> Builder(Incoming->getTerminator()); Value *NewInit = nullptr; - if(auto *C = dyn_cast(Init)) { - if(C->isZeroValue()) + if (auto *C = dyn_cast(Init)) { + if (C->isZeroValue()) NewInit = Constant::getNullValue(NewVTy); } if (!NewInit) NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {Init, Constant::getNullValue(VTy)}); + {Init, Constant::getNullValue(VTy)}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); auto *FinalReduction = ReductionInfo[Real].second; Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt()); - // TODO Ensure that the `AddReduce` here matches the original, found in `FinalReduction` + auto *AddReduce = Builder.CreateAddReduce(OperationReplacement); FinalReduction->replaceAllUsesWith(AddReduce); } @@ -2151,7 +2154,8 @@ void ComplexDeinterleavingGraph::replaceNodes() { ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(RootReal); DeadInstrRoots.push_back(RootImag); - } else if(RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) { + } else if (RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle) { auto *RootInst = cast(RootNode->Real); ReductionInfo[RootInst].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(ReductionInfo[RootInst].second); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7c3c32643ed64..869e4e48427e8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29230,7 +29230,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; - int AccStride = cast(Accumulator->getType())->getElementCount().getKnownMinValue() / 2; + int AccStride = cast(Accumulator->getType()) + ->getElementCount() + .getKnownMinValue() / + 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB 
= B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); @@ -29241,19 +29244,22 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; Type *FullTy = Ty; - FullTy = Accumulator->getType(); - auto *HalfAccTy = VectorType::getHalfElementsVectorType(cast(Accumulator->getType())); - LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); - UpperSplitAcc = - B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); + FullTy = Accumulator->getType(); + auto *HalfAccTy = VectorType::getHalfElementsVectorType( + cast(Accumulator->getType())); + LowerSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); + UpperSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); auto *LowerSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), LowerSplitInt, - B.getInt64(0)); - return B.CreateInsertVector(FullTy, Result, UpperSplitInt, B.getInt64(AccStride)); + auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), + LowerSplitInt, B.getInt64(0)); + return B.CreateInsertVector(FullTy, Result, UpperSplitInt, + B.getInt64(AccStride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { From 42fba2826094bd5cd6ec2d8f82c8588b76ea3bb9 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Fri, 18 Oct 2024 13:14:33 +0100 Subject: [PATCH 03/14] Remove erroneously added function and call --- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 3a5436714715b..18ad74aa9bae1 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -335,7 +335,6 @@ class ComplexDeinterleavingGraph { /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); - NodePtr identifyPartialReduction(Value *R, Value *I); NodePtr identifyNode(Value *R, Value *I); NodePtr identifyNode(Value *R, Value *I, bool &FromCache); @@ -916,8 +915,6 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - if (NodePtr CN = identifyPartialReduction(R, I)) - return CN; bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); if (!IsReduction && R->getType() != I->getType()) From 918312c13b722a45a138cf126a567876efaf22a5 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 22 Oct 2024 14:32:57 +0100 Subject: [PATCH 04/14] Fix case where it fails to identify unrolled reductions Also removed prematurely-added test --- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 4 +- .../AArch64/complex-deinterleaving-cdot.ll | 170 ------------------ 2 files changed, 2 insertions(+), 172 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 18ad74aa9bae1..edad678e4d0c0 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -1546,11 +1546,10 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for 
(size_t i = 0; i < OperationInstruction.size(); ++i) { if (Processed[i]) continue; - auto *Real = OperationInstruction[i]; for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - + auto *Real = OperationInstruction[i]; auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; @@ -1583,6 +1582,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } } + auto *Real = OperationInstruction[i]; // We want to check that we have 2 operands, but the function attributes // being counted as operands bloats this value. if (Real->getNumOperands() < 2) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll deleted file mode 100644 index 6277f9a3842bb..0000000000000 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ /dev/null @@ -1,170 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { -; CHECK-LABEL: define i32 @cdotp( -; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] -; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] -; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) -; CHECK-NEXT: [[VEC_PHI:%.*]] = 
call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) -; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] -; CHECK: [[FOR_COND_CLEANUP]]: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 -; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 -; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 -; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] -; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] -; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] -; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - br i1 %cmp28.not, label %for.cond.cleanup, label 
%for.body.preheader -for.body.preheader: ; preds = %entry - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - %0 = call i64 @llvm.vscale.i64() - %1 = mul i64 %0, 16 - %min.iters.check = icmp ult i64 %wide.trip.count, %1 - br i1 %min.iters.check, label %scalar.ph, label %vector.ph -vector.ph: ; preds = %for.body.preheader - %2 = call i64 @llvm.vscale.i64() - %3 = mul i64 %2, 16 - %n.mod.vf = urem i64 %wide.trip.count, %3 - %n.vec = sub i64 %wide.trip.count, %n.mod.vf - %4 = call i64 @llvm.vscale.i64() - %5 = mul i64 %4, 16 - br label %vector.body -vector.body: - %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %vec.phi = phi [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ] - %index.i = shl nuw nsw i64 %index, 1 - %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i - %a.load = load , ptr %a.ptr - %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) - %a.real = extractvalue { , } %a.deinterleaved, 0 - %a.imag = extractvalue { , } %a.deinterleaved, 1 - %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i - %b.load = load , ptr %b.ptr - %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) - %b.real = extractvalue { , } %b.deinterleaved, 0 - %b.imag = extractvalue { , } %b.deinterleaved, 1 - %a.real.ext = sext %a.real to - %a.imag.ext = sext %a.imag to - %b.real.ext = sext %b.real to - %b.imag.ext = sext %b.imag to - %real.mul = mul nsw %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) - %imag.mul = mul nsw %b.imag.ext, %a.imag.ext - %imag.mul.neg = sub zeroinitializer, %imag.mul - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) - %index.next = add nuw i64 %index, %5 - %22 = icmp eq i64 %index.next, %n.vec - br i1 %22, label %middle.block, label %vector.body -middle.block: ; preds = %vector.body - %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) - %cmp.n = icmp eq i64 %wide.trip.count, %n.vec - br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph -scalar.ph: ; preds = %middle.block, %for.body.preheader - %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] - %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ] - br label %for.body -for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body - %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ] - br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %res.0.lcssa -for.body: ; preds = %scalar.ph, %for.body - %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] - %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ] - %26 = shl nuw nsw i64 %indvars.iv, 1 - %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26 - %27 = load i8, ptr %arrayidx, align 1 - %conv = sext i8 %27 to i32 - %28 = or disjoint i64 %26, 1 - %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28 - %29 = load i8, ptr %arrayidx4, align 1 - %conv5 = sext i8 %29 to i32 - %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26 - %30 = load i8, ptr %arrayidx9, align 1 - %conv10 = sext i8 %30 to i32 - %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28 - %31 = load i8, ptr %arrayidx14, align 1 - %conv15 = sext i8 %31 to i32 - %mul16 = 
mul nsw i32 %conv10, %conv - %add17 = add nsw i32 %mul16, %res.030 - %mul18 = mul nsw i32 %conv15, %conv5 - %sub = sub i32 %add17, %mul18 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body -} From e492fa4691ce844ba2d3b3a3e5ab05130b7b58e6 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 22 Oct 2024 14:45:40 +0100 Subject: [PATCH 05/14] Address formatting errors --- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index edad678e4d0c0..d92dcdcd943b1 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -915,7 +915,6 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); if (!IsReduction && R->getType() != I->getType()) return nullptr; From 838dff4e36055761ecb884cf47e441d49a610bb8 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 8 Oct 2024 14:53:49 +0100 Subject: [PATCH 06/14] Add support for complex dot product operations --- .../llvm/CodeGen/ComplexDeinterleavingPass.h | 1 + .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 164 +++++++++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 9 + .../AArch64/complex-deinterleaving-cdot.ll | 170 ++++++++++++++++++ 4 files changed, 344 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h index a3fa219772770..4383249658e60 100644 --- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -35,6 +35,7 @@ struct ComplexDeinterleavingPass enum class ComplexDeinterleavingOperation { CAdd, CMulPartial, + CDot, // The following 'operations' are used to represent internal states. Backends // are not expected to try and support these in any capacity. 
   Deinterleave,
   Splat,
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index d92dcdcd943b1..bfdae30722548 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -335,6 +335,8 @@ class ComplexDeinterleavingGraph {
   /// i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
   NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+  NodePtr identifyPartialReduction(Value *R, Value *I);
+  NodePtr identifyDotProduct(Value *Inst);
 
   NodePtr identifyNode(Value *R, Value *I);
   NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
@@ -900,6 +902,152 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
   return submitCompositeNode(Node);
 }
 
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
+  auto *Inst = cast<Instruction>(V);
+
+  if(!TL->isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation::CDot, Inst->getType())) {
+    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving operation CDot with the type " << *Inst->getType() << "\n");
+    return nullptr;
+  }
+
+  auto *RealUser = cast<Instruction>(*Inst->user_begin());
+
+  NodePtr CN = prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
+
+  NodePtr ANode;
+
+  const Intrinsic::ID PartialReduceInt = Intrinsic::experimental_vector_partial_reduce_add;
+
+  Value *AReal = nullptr;
+  Value *AImag = nullptr;
+  Value *BReal = nullptr;
+  Value *BImag = nullptr;
+  Value *Phi = nullptr;
+
+  auto UnwrapCast = [](Value *V) -> Value* {
+    if(auto *CI = dyn_cast<CastInst>(V))
+      return CI->getOperand(0);
+    return V;
+  };
+
+  auto PatternRot0 =
+    m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+        m_Value(Phi),
+        m_Mul(m_Value(BReal), m_Value(AReal))),
+      m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
+
+  auto PatternRot270 =
+    m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+        m_Value(Phi),
+        m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
+      m_Mul(m_Value(BImag), m_Value(AReal)));
+
+  if(match(Inst, PatternRot0)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
+  }else if(match(Inst, PatternRot270)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
+  }else {
+    Value *A0, *A1;
+    // The rotations 90 and 180 share the same operation pattern, so inspect the
+    // order of the operands, identifying where the real and imaginary components
+    // of A go, to discern between the aforementioned rotations.
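+    // For reference, with a = ar + i*ai, b = br + i*bi and acc the running
+    // partial sum, the four accumulations matched in this function work out
+    // to (rotation labels as assigned here):
+    //   rot0:   acc += ar*br - ai*bi     rot90:  acc += ai*br + ar*bi
+    //   rot180: acc += ar*br + ai*bi     rot270: acc += ar*bi - ai*br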
+    auto PatternRot90Rot180 =
+      m_Intrinsic<PartialReduceInt>(
+        m_Intrinsic<PartialReduceInt>(
+          m_Value(Phi),
+          m_Mul(m_Value(BReal), m_Value(A0))
+        ),
+        m_Mul(m_Value(BImag), m_Value(A1)));
+
+    if(!match(Inst, PatternRot90Rot180))
+      return nullptr;
+
+    A0 = UnwrapCast(A0);
+    A1 = UnwrapCast(A1);
+
+    // Test if A0 is real/A1 is imag
+    ANode = identifyNode(A0, A1);
+    if(!ANode) {
+      // Test if A0 is imag/A1 is real
+      ANode = identifyNode(A1, A0);
+      // Unable to identify operand components, thus unable to identify rotation
+      if(!ANode)
+        return nullptr;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
+      AReal = A1;
+      AImag = A0;
+    } else {
+      AReal = A0;
+      AImag = A1;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_180;
+    }
+  }
+
+  AReal = UnwrapCast(AReal);
+  AImag = UnwrapCast(AImag);
+  BReal = UnwrapCast(BReal);
+  BImag = UnwrapCast(BImag);
+
+  bool WasANodeFromCache = false;
+  NodePtr Node = identifyNode(AReal, AImag, WasANodeFromCache);
+
+  // In the case that a node was identified to figure out the rotation, ensure that trying to
+  // identify a node with AReal and AImag post-unwrap results in the same node
+  if(Node && ANode && !WasANodeFromCache) {
+    LLVM_DEBUG(dbgs() << "Identified node is different from previously identified node. Unable to confidently generate a complex operation node\n");
+    return nullptr;
+  }
+
+  CN->addOperand(Node);
+  CN->addOperand(identifyNode(BReal, BImag));
+  CN->addOperand(identifyNode(Phi, RealUser));
+
+  return submitCompositeNode(CN);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
+  if (!I->hasOneUser())
+    return nullptr;
+
+  VectorType *RealTy = dyn_cast<VectorType>(R->getType());
+  if(!RealTy)
+    return nullptr;
+  VectorType *ImagTy = dyn_cast<VectorType>(I->getType());
+  if(!ImagTy)
+    return nullptr;
+
+  if(RealTy->isScalableTy() != ImagTy->isScalableTy())
+    return nullptr;
+  if(RealTy->getElementType() != ImagTy->getElementType())
+    return nullptr;
+
+  // `I` is known to only have one user, so iterate over the Phi (R) users to find the common user between R and I
+  auto *CommonUser = *I->user_begin();
+  bool CommonUserFound = false;
+  for (auto *User : R->users()) {
+    if (User == CommonUser) {
+      CommonUserFound = true;
+      break;
+    }
+  }
+
+  if (!CommonUserFound)
+    return nullptr;
+
+  auto *IInst = dyn_cast<IntrinsicInst>(CommonUser);
+  if (!IInst || IInst->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
+    return nullptr;
+
+  if(NodePtr CN = identifyDotProduct(IInst))
+    return CN;
+
+  return nullptr;
+}
+
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
   bool _;
@@ -915,6 +1063,9 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
     return It->second;
   }
 
+  if(NodePtr CN = identifyPartialReduction(R, I))
+    return CN;
+
   bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
   if (!IsReduction && R->getType() != I->getType())
     return nullptr;
@@ -1535,6 +1686,7 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
 }
 
 void ComplexDeinterleavingGraph::identifyReductionNodes() {
+  dbgs() << "identifyReductionNodes\n";
   SmallVector<bool> Processed(ReductionInfo.size(), false);
   SmallVector<Instruction *> OperationInstruction;
   for (auto &P : ReductionInfo)
@@ -1590,6 +1742,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
     RealPHI = ReductionInfo[Real].first;
     ImagPHI = nullptr;
     PHIsFound = false;
+    dbgs() << "identifyNode from Phi " << *RealPHI << " / " << *Real << "\n";
     auto Node = identifyNode(Real->getOperand(0),
Real->getOperand(1)); if (Node && PHIsFound) { LLVM_DEBUG( @@ -1978,6 +2131,17 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, Value *ReplacementNode; switch (Node->Operation) { + case ComplexDeinterleavingOperation::CDot: { + Value *Input0 = ReplaceOperandIfExist(Node, 0); + Value *Input1 = ReplaceOperandIfExist(Node, 1); + Value *Accumulator = ReplaceOperandIfExist(Node, 2); + assert(!Input1 || (Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type")); + ReplacementNode = TL->createComplexDeinterleavingIR( + Builder, Node->Operation, Node->Rotation, Input0, Input1, + Accumulator); + break; + } case ComplexDeinterleavingOperation::CAdd: case ComplexDeinterleavingOperation::CMulPartial: case ComplexDeinterleavingOperation::Symmetric: { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 869e4e48427e8..a2dcf7f851b55 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29205,6 +29205,9 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); + + if (Operation == ComplexDeinterleavingOperation::CDot) + return ScalarWidth == 32 || ScalarWidth == 64; return 8 <= ScalarWidth && ScalarWidth <= 64; } @@ -29314,6 +29317,12 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } + if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && IsScalable) { + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_cdot, Accumulator->getType(), + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + } + return nullptr; } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll new file mode 100644 index 0000000000000..6277f9a3842bb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: define i32 @cdotp( +; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; 
CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] +; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) +; CHECK-NEXT: [[VEC_PHI:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 +; CHECK-NEXT: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 +; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] +; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] +; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] +; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader +for.body.preheader: ; preds = %entry + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + %0 = call i64 @llvm.vscale.i64() + %1 = mul i64 %0, 16 + %min.iters.check = icmp ult i64 %wide.trip.count, %1 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph +vector.ph: ; preds = %for.body.preheader + %2 = call i64 @llvm.vscale.i64() + %3 = mul i64 %2, 16 + %n.mod.vf = urem i64 %wide.trip.count, %3 + %n.vec = sub i64 %wide.trip.count, %n.mod.vf + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 16 + br label %vector.body +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ] + %index.i = shl nuw nsw i64 %index, 1 + %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i + %a.load = load , ptr %a.ptr + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i + %b.load = load , ptr %b.ptr + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul nsw %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul nsw %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + %index.next = add nuw i64 %index, %5 + %22 = icmp eq i64 %index.next, %n.vec + br i1 %22, label %middle.block, label %vector.body +middle.block: ; preds = %vector.body + %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + %cmp.n = icmp eq i64 %wide.trip.count, %n.vec + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph +scalar.ph: ; preds = %middle.block, %for.body.preheader + %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] + 
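+  ; %bc.merge.rdx seeds the scalar remainder loop with the vector loop's reduced sum (%25), or with 0 when the vector path was skipped.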
%bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ]
+  br label %for.body
+for.cond.cleanup.loopexit:                        ; preds = %middle.block, %for.body
+  %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ]
+  br label %for.cond.cleanup
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %res.0.lcssa
+for.body:                                         ; preds = %scalar.ph, %for.body
+  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+  %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ]
+  %26 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26
+  %27 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %27 to i32
+  %28 = or disjoint i64 %26, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28
+  %29 = load i8, ptr %arrayidx4, align 1
+  %conv5 = sext i8 %29 to i32
+  %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26
+  %30 = load i8, ptr %arrayidx9, align 1
+  %conv10 = sext i8 %30 to i32
+  %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28
+  %31 = load i8, ptr %arrayidx14, align 1
+  %conv15 = sext i8 %31 to i32
+  %mul16 = mul nsw i32 %conv10, %conv
+  %add17 = add nsw i32 %mul16, %res.030
+  %mul18 = mul nsw i32 %conv15, %conv5
+  %sub = sub i32 %add17, %mul18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

From b2410688146531936db5f58ed2f0ebf78bf8387a Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Mon, 28 Oct 2024 15:29:18 +0000
Subject: [PATCH 07/14] Apply clang-format

---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 107 +++++++++---------
 .../Target/AArch64/AArch64ISelLowering.cpp    |   7 +-
 2 files changed, 59 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index bfdae30722548..098d7ead3456e 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -906,18 +906,23 @@ ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   auto *Inst = cast<Instruction>(V);
 
-  if(!TL->isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation::CDot, Inst->getType())) {
-    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving operation CDot with the type " << *Inst->getType() << "\n");
+  if (!TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CDot, Inst->getType())) {
+    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
+                         "operation CDot with the type "
+                      << *Inst->getType() << "\n");
     return nullptr;
   }
 
   auto *RealUser = cast<Instruction>(*Inst->user_begin());
 
-  NodePtr CN = prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
+  NodePtr CN =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
 
   NodePtr ANode;
 
-  const Intrinsic::ID PartialReduceInt = Intrinsic::experimental_vector_partial_reduce_add;
+  const Intrinsic::ID PartialReduceInt =
+      Intrinsic::experimental_vector_partial_reduce_add;
 
   Value *AReal = nullptr;
   Value *AImag = nullptr;
@@ -925,56 +930,49 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   Value *BImag = nullptr;
   Value *Phi = nullptr;
 
-  auto UnwrapCast = [](Value *V) -> Value* {
-    if(auto *CI = dyn_cast<CastInst>(V))
+  auto UnwrapCast = [](Value *V) -> Value * {
+    if (auto *CI = dyn_cast<CastInst>(V))
       return CI->getOperand(0);
     return V;
   };
-
-  auto PatternRot0 =
-      m_Intrinsic<PartialReduceInt>(
-          m_Intrinsic<PartialReduceInt>(
-              m_Value(Phi),
-              m_Mul(m_Value(BReal), m_Value(AReal))),
-          m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
-
-  auto PatternRot270 =
-      m_Intrinsic<PartialReduceInt>(
-          m_Intrinsic<PartialReduceInt>(
-              m_Value(Phi),
-              m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
-          m_Mul(m_Value(BImag), m_Value(AReal)));
-
-  if(match(Inst, PatternRot0)) {
+
+  auto PatternRot0 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                    m_Mul(m_Value(BReal), m_Value(AReal))),
+      m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
+
+  auto PatternRot270 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+          m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
+      m_Mul(m_Value(BImag), m_Value(AReal)));
+
+  if (match(Inst, PatternRot0)) {
     CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
-  }else if(match(Inst, PatternRot270)) {
+  } else if (match(Inst, PatternRot270)) {
     CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
-  }else {
+  } else {
     Value *A0, *A1;
     // The rotations 90 and 180 share the same operation pattern, so inspect the
-    // order of the operands, identifying where the real and imaginary components
-    // of A go, to discern between the aforementioned rotations.
-    auto PatternRot90Rot180 =
-        m_Intrinsic<PartialReduceInt>(
-            m_Intrinsic<PartialReduceInt>(
-                m_Value(Phi),
-                m_Mul(m_Value(BReal), m_Value(A0))
-            ),
-            m_Mul(m_Value(BImag), m_Value(A1)));
-
-    if(!match(Inst, PatternRot90Rot180))
+    // order of the operands, identifying where the real and imaginary
+    // components of A go, to discern between the aforementioned rotations.
+    auto PatternRot90Rot180 = m_Intrinsic<PartialReduceInt>(
+        m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                      m_Mul(m_Value(BReal), m_Value(A0))),
+        m_Mul(m_Value(BImag), m_Value(A1)));
+
+    if (!match(Inst, PatternRot90Rot180))
       return nullptr;
-
+
     A0 = UnwrapCast(A0);
     A1 = UnwrapCast(A1);
 
     // Test if A0 is real/A1 is imag
     ANode = identifyNode(A0, A1);
-    if(!ANode) {
+    if (!ANode) {
       // Test if A0 is imag/A1 is real
       ANode = identifyNode(A1, A0);
       // Unable to identify operand components, thus unable to identify rotation
-      if(!ANode)
+      if (!ANode)
        return nullptr;
       CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
       AReal = A1;
@@ -994,10 +992,14 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
 
   bool WasANodeFromCache = false;
   NodePtr Node = identifyNode(AReal, AImag, WasANodeFromCache);
 
-  // In the case that a node was identified to figure out the rotation, ensure that trying to
-  // identify a node with AReal and AImag post-unwrap results in the same node
-  if(Node && ANode && !WasANodeFromCache) {
-    LLVM_DEBUG(dbgs() << "Identified node is different from previously identified node. Unable to confidently generate a complex operation node\n");
+  // In the case that a node was identified to figure out the rotation, ensure
+  // that trying to identify a node with AReal and AImag post-unwrap results in
+  // the same node
+  if (Node && ANode && !WasANodeFromCache) {
+    LLVM_DEBUG(
+        dbgs()
+        << "Identified node is different from previously identified node. "
+           "Unable to confidently generate a complex operation node\n");
     return nullptr;
   }
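For reference, the accumulation each matched rotation performs per complex element pair (a sketch derived from the four matchers above and the rot0/90/180/270 tests later in this series; re/im denote the deinterleaved real and imaginary halves, and acc the partial-reduction accumulator):

  rot0:   acc += b.re * a.re - b.im * a.im
  rot90:  acc += b.re * a.im + b.im * a.re
  rot180: acc += b.re * a.re + b.im * a.im
  rot270: acc += b.im * a.re - b.re * a.im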
" + "Unable to confidently generate a complex operation node\n"); return nullptr; } @@ -1014,18 +1016,19 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { return nullptr; VectorType *RealTy = dyn_cast(R->getType()); - if(!RealTy) + if (!RealTy) return nullptr; VectorType *ImagTy = dyn_cast(I->getType()); - if(!ImagTy) + if (!ImagTy) return nullptr; - if(RealTy->isScalableTy() != ImagTy->isScalableTy()) + if (RealTy->isScalableTy() != ImagTy->isScalableTy()) return nullptr; - if(RealTy->getElementType() != ImagTy->getElementType()) + if (RealTy->getElementType() != ImagTy->getElementType()) return nullptr; - // `I` is known to only have one user, so iterate over the Phi (R) users to find the common user between R and I + // `I` is known to only have one user, so iterate over the Phi (R) users to + // find the common user between R and I auto *CommonUser = *I->user_begin(); bool CommonUserFound = false; for (auto *User : R->users()) { @@ -1039,10 +1042,11 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { return nullptr; auto *IInst = dyn_cast(CommonUser); - if (!IInst || IInst->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) + if (!IInst || IInst->getIntrinsicID() != + Intrinsic::experimental_vector_partial_reduce_add) return nullptr; - if(NodePtr CN = identifyDotProduct(IInst)) + if (NodePtr CN = identifyDotProduct(IInst)) return CN; return nullptr; @@ -1063,7 +1067,7 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - if(NodePtr CN = identifyPartialReduction(R, I)) + if (NodePtr CN = identifyPartialReduction(R, I)) return CN; bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); @@ -2138,8 +2142,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, assert(!Input1 || (Input0->getType() == Input1->getType() && "Node inputs need to be of the same type")); ReplacementNode = TL->createComplexDeinterleavingIR( - Builder, Node->Operation, Node->Rotation, Input0, Input1, - Accumulator); + Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); break; } case ComplexDeinterleavingOperation::CAdd: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a2dcf7f851b55..c89beb7233af6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29317,10 +29317,11 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } - if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && IsScalable) { + if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && + IsScalable) { return B.CreateIntrinsic( - Intrinsic::aarch64_sve_cdot, Accumulator->getType(), - {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + Intrinsic::aarch64_sve_cdot, Accumulator->getType(), + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); } return nullptr; From 1bf2f2ed7f5c5cfa9d851c7b848217a63a6ccbc7 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 29 Oct 2024 16:07:52 +0000 Subject: [PATCH 08/14] Change identifyNode parameter to use bool* instead of bool& Remove debug comments --- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 
index 098d7ead3456e..11f5813c33b0f 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -338,8 +338,7 @@ class ComplexDeinterleavingGraph {
   NodePtr identifyPartialReduction(Value *R, Value *I);
   NodePtr identifyDotProduct(Value *Inst);
 
-  NodePtr identifyNode(Value *R, Value *I);
-  NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
+  NodePtr identifyNode(Value *R, Value *I, bool *FromCache = nullptr);
 
   /// Determine if a sum of complex numbers can be formed from \p RealAddends
   /// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
@@ -990,7 +989,7 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   BImag = UnwrapCast(BImag);
 
   bool WasANodeFromCache = false;
-  NodePtr Node = identifyNode(AReal, AImag, WasANodeFromCache);
+  NodePtr Node = identifyNode(AReal, AImag, &WasANodeFromCache);
 
   // In the case that a node was identified to figure out the rotation, ensure
   // that trying to identify a node with AReal and AImag post-unwrap results in
@@ -1053,17 +1052,12 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
 }
 
 ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
-  bool _;
-  return identifyNode(R, I, _);
-}
-
-ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool *FromCache) {
   auto It = CachedResult.find({R, I});
   if (It != CachedResult.end()) {
     LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
-    FromCache = true;
+    if (FromCache != nullptr)
+      *FromCache = true;
     return It->second;
   }
 
@@ -1690,7 +1684,6 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
 }
 
 void ComplexDeinterleavingGraph::identifyReductionNodes() {
-  dbgs() << "identifyReductionNodes\n";
   SmallVector<bool> Processed(ReductionInfo.size(), false);
   SmallVector<Instruction *> OperationInstruction;
   for (auto &P : ReductionInfo)
@@ -1746,7 +1739,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
     RealPHI = ReductionInfo[Real].first;
     ImagPHI = nullptr;
     PHIsFound = false;
-    dbgs() << "identifyNode from Phi " << *RealPHI << " / " << *Real << "\n";
     auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
     if (Node && PHIsFound) {
       LLVM_DEBUG(

From b19c99f23ba189ae8fc54d4c23ebf9d64c38a404 Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 29 Oct 2024 16:38:11 +0000
Subject: [PATCH 09/14] Simplify complex-deinterleaving-cdot.ll

---
 .../AArch64/complex-deinterleaving-cdot.ll    | 190 ++++--------------
 1 file changed, 39 insertions(+), 151 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
index 6277f9a3842bb..6e2a2044db81f 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -4,167 +4,55 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
-define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+define i32 @cdotp() {
 ; CHECK-LABEL: define i32 @cdotp(
-; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT:    br i1
[[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] -; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] -; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) -; CHECK-NEXT: [[VEC_PHI:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) -; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.interleave2.nxv32i8( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP2]], [[TMP3]], i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP7]], [[TMP4]], [[TMP5]], i32 0) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP11]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP10]], [[TMP9]], i64 4) +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] -; CHECK: [[FOR_COND_CLEANUP]]: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 -; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 -; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 -; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] -; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] -; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] -; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP11]]) +; CHECK-NEXT: ret i32 [[TMP12]] ; entry: - %cmp28.not = icmp ult i32 %N, 2 - br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader -for.body.preheader: ; preds = %entry - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to 
i64 - %0 = call i64 @llvm.vscale.i64() - %1 = mul i64 %0, 16 - %min.iters.check = icmp ult i64 %wide.trip.count, %1 - br i1 %min.iters.check, label %scalar.ph, label %vector.ph -vector.ph: ; preds = %for.body.preheader - %2 = call i64 @llvm.vscale.i64() - %3 = mul i64 %2, 16 - %n.mod.vf = urem i64 %wide.trip.count, %3 - %n.vec = sub i64 %wide.trip.count, %n.mod.vf - %4 = call i64 @llvm.vscale.i64() - %5 = mul i64 %4, 16 br label %vector.body -vector.body: - %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %vec.phi = phi [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ] - %index.i = shl nuw nsw i64 %index, 1 - %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i - %a.load = load , ptr %a.ptr - %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) - %a.real = extractvalue { , } %a.deinterleaved, 0 - %a.imag = extractvalue { , } %a.deinterleaved, 1 - %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i - %b.load = load , ptr %b.ptr - %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) - %b.real = extractvalue { , } %b.deinterleaved, 0 - %b.imag = extractvalue { , } %b.deinterleaved, 1 - %a.real.ext = sext %a.real to - %a.imag.ext = sext %a.imag to - %b.real.ext = sext %b.real to - %b.imag.ext = sext %b.imag to - %real.mul = mul nsw %b.real.ext, %a.real.ext + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %a.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %b.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %b.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %real.mul = mul %b.real.ext, %a.real.ext %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) - %imag.mul = mul nsw %b.imag.ext, %a.imag.ext + %imag.mul = mul %b.imag.ext, %a.imag.ext %imag.mul.neg = sub zeroinitializer, %imag.mul %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) - %index.next = add nuw i64 %index, %5 - %22 = icmp eq i64 %index.next, %n.vec - br i1 %22, label %middle.block, label %vector.body + br i1 true, label %middle.block, label %vector.body + middle.block: ; preds = %vector.body - %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) - %cmp.n = icmp eq i64 %wide.trip.count, %n.vec - br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph -scalar.ph: ; preds = %middle.block, %for.body.preheader - %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] - %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ] - br label %for.body -for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body - %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ] - br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %res.0.lcssa -for.body: ; preds = %scalar.ph, %for.body - %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] - %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ] - %26 = shl nuw nsw i64 %indvars.iv, 1 - %arrayidx = 
getelementptr inbounds i8, ptr %a, i64 %26 - %27 = load i8, ptr %arrayidx, align 1 - %conv = sext i8 %27 to i32 - %28 = or disjoint i64 %26, 1 - %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28 - %29 = load i8, ptr %arrayidx4, align 1 - %conv5 = sext i8 %29 to i32 - %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26 - %30 = load i8, ptr %arrayidx9, align 1 - %conv10 = sext i8 %30 to i32 - %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28 - %31 = load i8, ptr %arrayidx14, align 1 - %conv15 = sext i8 %31 to i32 - %mul16 = mul nsw i32 %conv10, %conv - %add17 = add nsw i32 %mul16, %res.030 - %mul18 = mul nsw i32 %conv15, %conv5 - %sub = sub i32 %add17, %mul18 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 } + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.nxv4i32() #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From fc68abee7a1eaa79e8a57808cbb820ba9cbbccb3 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Fri, 8 Nov 2024 10:41:52 +0000 Subject: [PATCH 10/14] Add additional test cases for cdot --- .../AArch64/complex-deinterleaving-cdot.ll | 813 +++++++++++++++++- 1 file changed, 786 insertions(+), 27 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index 6e2a2044db81f..72722260a2f78 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -1,41 +1,100 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE +; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" -define i32 @cdotp() { -; CHECK-LABEL: define i32 @cdotp( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.interleave2.nxv32i8( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) -; 
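+; Note: the CHECK-SVE and CHECK-NOSVE runs cover targets without SVE2's cdot instruction; for those, the pass is expected to leave the deinterleave and partial-reduce IR unchanged rather than form @llvm.aarch64.sve.cdot.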
CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 4) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP2]], [[TMP3]], i32 0) -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP7]], [[TMP4]], [[TMP5]], i32 0) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP8]], i64 0) -; CHECK-NEXT: [[TMP11]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP10]], [[TMP9]], i64 4) -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP11]]) -; CHECK-NEXT: ret i32 [[TMP12]] +define i32 @cdotp_i8_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], 
[[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] ; entry: br label %vector.body vector.body: ; preds = %vector.body, %entry %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] - %a.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to - %a.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to - %b.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to - %b.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + 
%a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext @@ -48,11 +107,711 @@ middle.block: ; preds = %vector.body ret i32 %0 } +define i32 @cdotp_i8_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] 
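+; Note: the rotation immediate on @llvm.aarch64.sve.cdot (90 here) is computed by AArch64TargetLowering::createComplexDeinterleavingIR as B.getInt32((int)Rotation * 90).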
+; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = 
extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = 
call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: 
[[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i64 @cdotp_i16_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br 
label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; 
CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] 
= extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced 
= call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], 
[[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ 
[[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } 
@llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) #0 +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) #0 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.vector.reduce.add.nxv4i32() #1 +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.nxv2i64() #1 attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From 97eff4d20c933df2afe5b7c533afc13b69f4a9ef Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Mon, 25 Nov 2024 
10:54:41 +0000 Subject: [PATCH 11/14] Add negative test case for cdot --- .../AArch64/complex-deinterleaving-cdot.ll | 123 ++++++++++++++++-- 1 file changed, 113 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index 72722260a2f78..c9865558d94c2 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -803,15 +803,118 @@ middle.block: ; preds = %vector.body } -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) #0 -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) #0 +define i32 @not_cdotp( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @not_cdotp( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @not_cdotp( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: 
[[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @not_cdotp( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = 
extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                    ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #1
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>) #1
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
-attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)

From 646ffe3b42e96a428b372ddbde0b2fe5001b968b Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 10 Dec 2024 15:03:17 +0000
Subject: [PATCH 12/14] Address comments, and refactor where certain checks
 are performed

---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp |  73 +++---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +
 .../AArch64/complex-deinterleaving-cdot.ll    | 216 ++++++++++++++++++
 3 files changed, 255 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 95a9768da27df..b9fbce6955e7a 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,8 +108,15 @@ static bool isNeg(Value *V);
 static Value *getNegOperand(Value *V);

 namespace {
+  template <typename T, typename IterT>
+  std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
+    auto Common = llvm::find_if(A, [B](T I){return llvm::is_contained(B, I);});
+    if (Common != A.end())
+      return std::make_optional(*Common);
+    return std::nullopt;
+  }

-class ComplexDeinterleavingLegacyPass : public FunctionPass {
+  class ComplexDeinterleavingLegacyPass : public FunctionPass {
 public:
   static char ID;

@@ -337,7 +344,7 @@ class ComplexDeinterleavingGraph {
   NodePtr identifyPartialReduction(Value *R, Value *I);
   NodePtr identifyDotProduct(Value *Inst);

-  NodePtr identifyNode(Value *R, Value *I, bool *FromCache = nullptr);
+  NodePtr identifyNode(Value *R, Value *I);

   /// Determine if a sum of complex numbers can be formed from \p RealAddends
   /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
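For review context: the findCommonBetweenCollections helper introduced above returns the first element of A that is also contained in B, wrapped in std::optional, or std::nullopt when the two ranges share nothing. A minimal standalone sketch of the same behaviour (using the standard library in place of llvm::find_if and llvm::is_contained; the driver below is hypothetical and not part of this patch):

    #include <algorithm>
    #include <optional>
    #include <vector>

    // Standalone re-creation of the helper: yields the first element of A
    // that also appears in B, otherwise std::nullopt. T must be spelled out
    // explicitly because it cannot be deduced from the arguments.
    template <typename T, typename RangeT>
    std::optional<T> findCommonBetweenCollections(const RangeT &A,
                                                  const RangeT &B) {
      auto Common = std::find_if(A.begin(), A.end(), [&B](const T &V) {
        return std::find(B.begin(), B.end(), V) != B.end();
      });
      if (Common != A.end())
        return std::make_optional(*Common);
      return std::nullopt;
    }

    int main() {
      // Mirrors the pass's use: R's users are scanned for a value that is
      // also among I's users. Plain ints stand in for the two user lists.
      std::vector<int> RUsers{1, 7, 3}, IUsers{3, 7};
      // Returns 7: the first element of RUsers that IUsers also contains.
      return findCommonBetweenCollections<int>(RUsers, IUsers).value_or(-1);
    }
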
@@ -902,16 +909,16 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
-  auto *Inst = cast<Instruction>(V);
   if (!TL->isComplexDeinterleavingOperationSupported(
-          ComplexDeinterleavingOperation::CDot, Inst->getType())) {
+          ComplexDeinterleavingOperation::CDot, V->getType())) {
     LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
                          "operation CDot with the type "
-                      << *Inst->getType() << "\n");
+                      << *V->getType() << "\n");
     return nullptr;
   }

+  auto *Inst = cast<Instruction>(V);
   auto *RealUser = cast<Instruction>(*Inst->user_begin());

   NodePtr CN =
@@ -987,13 +994,26 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   BReal = UnwrapCast(BReal);
   BImag = UnwrapCast(BImag);

-  bool WasANodeFromCache = false;
-  NodePtr Node = identifyNode(AReal, AImag, &WasANodeFromCache);
+  VectorType *VTy = cast<VectorType>(V->getType());
+  Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2);
+  if (AReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (AImag->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BImag->getType() != ExpectedOperandTy)
+    return nullptr;
+
+  if (Phi->getType() != VTy && RealUser->getType() != VTy)
+    return nullptr;
+
+  NodePtr Node = identifyNode(AReal, AImag);

   // In the case that a node was identified to figure out the rotation, ensure
   // that trying to identify a node with AReal and AImag post-unwrap results in
   // the same node
-  if (Node && ANode && !WasANodeFromCache) {
+  if (ANode && Node != ANode) {
     LLVM_DEBUG(
         dbgs()
         << "Identified node is different from previously identified node. "
@@ -1010,38 +1030,17 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {

 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
-  if (!I->hasOneUser())
+  // Partial reductions don't support non-vector types, so check these first
+  if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
     return nullptr;

-  VectorType *RealTy = dyn_cast<VectorType>(R->getType());
-  if (!RealTy)
-    return nullptr;
-  VectorType *ImagTy = dyn_cast<VectorType>(I->getType());
-  if (!ImagTy)
-    return nullptr;
-
-  if (RealTy->isScalableTy() != ImagTy->isScalableTy())
-    return nullptr;
-  if (RealTy->getElementType() != ImagTy->getElementType())
-    return nullptr;
-
-  // `I` is known to only have one user, so iterate over the Phi (R) users to
-  // find the common user between R and I
-  auto *CommonUser = *I->user_begin();
-  bool CommonUserFound = false;
-  for (auto *User : R->users()) {
-    if (User == CommonUser) {
-      CommonUserFound = true;
-      break;
-    }
-  }
-
-  if (!CommonUserFound)
+  auto CommonUser = findCommonBetweenCollections<Value *>(R->users(), I->users());
+  if (!CommonUser)
     return nullptr;

-  auto *IInst = dyn_cast<IntrinsicInst>(CommonUser);
+  auto *IInst = dyn_cast<IntrinsicInst>(*CommonUser);
   if (!IInst || IInst->getIntrinsicID() !=
-                    Intrinsic::experimental_vector_partial_reduce_add)
+                Intrinsic::experimental_vector_partial_reduce_add)
     return nullptr;

   if (NodePtr CN = identifyDotProduct(IInst))
@@ -1051,12 +1050,10 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {

 ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool *FromCache) {
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
   auto It = CachedResult.find({R, I});
   if (It != CachedResult.end()) {
     LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
-    if (FromCache != nullptr)
-      *FromCache = true;
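// Illustrative note, not part of the patch: the ExpectedOperandTy guard
// above relies on VectorType::getSubdividedVectorType, which per subdivision
// halves the element width and doubles the element count, so two
// subdivisions of the cdot accumulator type reproduce the deinterleaved
// operand type. Assuming the in-tree DerivedTypes API:
//
//   LLVMContext Ctx;
//   // <vscale x 4 x i32>, the accumulator type of a valid i8 cdot loop.
//   auto *AccTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
//   // i32 -> i16 -> i8 and 4 -> 8 -> 16 lanes: <vscale x 16 x i8>.
//   Type *Expected = VectorType::getSubdividedVectorType(AccTy, 2);
//
// A <vscale x 8 x i16> accumulator instead subdivides to <vscale x 32 x i4>,
// which can never equal <vscale x 16 x i8> operands, so patterns like the
// invalid_type test added below fall through to the nullptr bail-outs.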
return It->second; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c66ff590f2aa9..423395a994587 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29414,6 +29414,10 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( return 8 <= ScalarWidth && ScalarWidth <= 64; } + // CDot is not supported outside of scalable/sve scopes + if (Operation == ComplexDeinterleavingOperation::CDot) + return false; + return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index c9865558d94c2..b02e5972f54e5 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -913,8 +913,224 @@ middle.block: ; preds = %vector.body ret i32 %0 } +define i16 @invalid_type( %a, %b) { +; CHECK-SVE2-LABEL: define i16 @invalid_type( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i16 [[TMP0]] +; +; CHECK-SVE-LABEL: define i16 @invalid_type( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } 
[[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i16 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i16 @invalid_type( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i16 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 
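; Illustrative note, not part of the original test: with a <vscale x 8 x i16>
; accumulator the partial reductions below narrow <vscale x 16 x i32>
; products to i16 elements, so the accumulator element is only twice as wide
; as the deinterleaved i8 inputs. The cdot rewrite needs a four-times-wider
; accumulator (<vscale x 4 x i32> for i8 inputs), so the operand-type guard
; in identifyDotProduct rejects this loop and the CHECK lines above show it
; left unchanged.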
+ %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i16 @llvm.vector.reduce.add.nxv8i16( %partial.reduce.sub) + ret i16 %0 +} + +define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0_fixed_length( +; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length( +; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } 
@llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length( +; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: 
[[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
+  %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
+  %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
+  %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
+  %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
+  %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
+  %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
+  %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
 declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
 declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
 declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
 declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)

From b3550a588a6b2087c27bdc89f0c849951cfb341f Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 10 Dec 2024 15:40:52 +0000
Subject: [PATCH 13/14] Fix formatting

---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index b9fbce6955e7a..3111354addacd 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,15 +108,15 @@ static bool isNeg(Value *V);
 static Value *getNegOperand(Value *V);

 namespace {
-  template <typename T, typename IterT>
-  std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
-    auto Common = llvm::find_if(A, [B](T I){return llvm::is_contained(B, I);});
-    if (Common != A.end())
-      return std::make_optional(*Common);
-    return std::nullopt;
-  }
+template <typename T, typename IterT>
+std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
+  auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); });
+  if (Common != A.end())
+    return std::make_optional(*Common);
+  return std::nullopt;
+}

-  class ComplexDeinterleavingLegacyPass : public FunctionPass {
+class ComplexDeinterleavingLegacyPass : public FunctionPass {
 public:
   static char ID;

@@ -207,7 +207,7 @@ struct ComplexDeinterleavingCompositeNode {
     }
   }

-  bool AreOperandsValid() { return OperandsValid; }
+  bool areOperandsValid() { return OperandsValid; }
 };

 class ComplexDeinterleavingGraph {
@@ -1034,13 +1034,14 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
   if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
     return nullptr;

-  auto CommonUser = findCommonBetweenCollections<Value *>(R->users(), I->users());
+  auto CommonUser =
+      findCommonBetweenCollections<Value *>(R->users(), I->users());
   if (!CommonUser)
     return nullptr;

   auto *IInst = dyn_cast<IntrinsicInst>(*CommonUser);
   if (!IInst || IInst->getIntrinsicID() !=
-                Intrinsic::experimental_vector_partial_reduce_add)
+                    Intrinsic::experimental_vector_partial_reduce_add)
     return nullptr;

   if (NodePtr CN = identifyDotProduct(IInst))
@@ -1756,7 +1757,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {

 bool ComplexDeinterleavingGraph::checkNodes() {
   for (NodePtr N : CompositeNodes) {
-    if (!N->AreOperandsValid())
+    if (!N->areOperandsValid())
       return false;
   }

From 0329be676d34474594cc4e1002d1ad93be42a585 Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 17 Dec 2024 17:00:58 +0000
Subject: [PATCH 14/14] Update test name

---
 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
index b02e5972f54e5..11cf4c31936d8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -1019,8 +1019,8 @@ middle.block: ; preds = %vector.body
   ret i16 %0
 }

-define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0_fixed_length(
+define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
 ; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
 ; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -1046,7 +1046,7 @@ define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
 ; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
 ;
-; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length(
+; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
 ; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
 ; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -1072,7 +1072,7 @@ define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
 ; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
 ;
-; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length(
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
 ; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
 ; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]