From e899c47c8748bc516f8b0964847f2f1438f5f120 Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 8 Oct 2024 14:53:36 +0100
Subject: [PATCH 01/14] Add support for single reductions in
 ComplexDeinterleavingPass

---
 .../llvm/CodeGen/ComplexDeinterleavingPass.h  |   1 +
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 121 +++++++++++--
 .../Target/AArch64/AArch64ISelLowering.cpp    |  19 +-
 .../AArch64/complex-deinterleaving-cdot.ll    | 170 ++++++++++++++++++
 4 files changed, 288 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll

diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 84a2673fecb5b..a3fa219772770 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -43,6 +43,7 @@ enum class ComplexDeinterleavingOperation {
   ReductionPHI,
   ReductionOperation,
   ReductionSelect,
+  ReductionSingle
 };
 
 enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8573b016d1e5b..08287a4d5ed02 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -145,6 +145,7 @@ struct ComplexDeinterleavingCompositeNode {
   friend class ComplexDeinterleavingGraph;
   using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
   using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+  bool OperandsValid = true;
 
 public:
   ComplexDeinterleavingOperation Operation;
@@ -161,7 +162,11 @@ struct ComplexDeinterleavingCompositeNode {
   SmallVector<RawNodePtr> Operands;
   Value *ReplacementNode = nullptr;
 
-  void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+  void addOperand(NodePtr Node) {
+    if (!Node || !Node.get())
+      OperandsValid = false;
+    Operands.push_back(Node.get());
+  }
 
   void dump() { dump(dbgs()); }
   void dump(raw_ostream &OS) {
@@ -195,6 +200,10 @@ struct ComplexDeinterleavingCompositeNode {
       PrintNodeRef(Op);
     }
   }
+
+  bool AreOperandsValid() {
+    return OperandsValid;
+  }
 };
 
 class ComplexDeinterleavingGraph {
@@ -294,7 +303,7 @@ class ComplexDeinterleavingGraph {
 
   NodePtr submitCompositeNode(NodePtr Node) {
     CompositeNodes.push_back(Node);
-    if (Node->Real && Node->Imag)
+    if (Node->Real)
       CachedResult[{Node->Real, Node->Imag}] = Node;
     return Node;
   }
@@ -328,8 +337,10 @@ class ComplexDeinterleavingGraph {
   /// i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
   NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+  NodePtr identifyPartialReduction(Value *R, Value *I);
 
   NodePtr identifyNode(Value *R, Value *I);
+  NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
 
   /// Determine if a sum of complex numbers can be formed from \p RealAddends
   /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
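A note on the two ComplexDeinterleavingPass.cpp hunks above: a null operand no
longer has to be caught at every call site; addOperand records that one was
seen, and checkNodes (extended later in this patch) rejects the whole graph
before any IR is rewritten. A condensed sketch of that contract, with
unrelated members elided (illustrative only, not a verbatim excerpt):

    struct ComplexDeinterleavingCompositeNode {
      bool OperandsValid = true;
      SmallVector<RawNodePtr> Operands;

      void addOperand(NodePtr Node) {
        if (!Node)                // identification failed somewhere below
          OperandsValid = false;  // defer the failure instead of asserting
        Operands.push_back(Node.get());
      }
      bool AreOperandsValid() { return OperandsValid; }
    };

    bool ComplexDeinterleavingGraph::checkNodes() {
      // Reject the graph wholesale if any composite node saw a null operand.
      for (NodePtr N : CompositeNodes)
        if (!N->AreOperandsValid())
          return false;
      // ... the existing instruction-coverage checks follow ...
      return true;
    }
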
@@ -397,6 +408,7 @@ class ComplexDeinterleavingGraph { /// * Deinterleave the final value outside of the loop and repurpose original /// reduction users void processReductionOperation(Value *OperationReplacement, RawNodePtr Node); + void processReductionSingle(Value *OperationReplacement, RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -893,16 +905,26 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { - LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); - assert(R->getType() == I->getType() && - "Real and imaginary parts should not have different types"); + bool _; + return identifyNode(R, I, _); +} +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { auto It = CachedResult.find({R, I}); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); + FromCache = true; return It->second; } + if(NodePtr CN = identifyPartialReduction(R, I)) + return CN; + + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if(!IsReduction && R->getType() != I->getType()) + return nullptr; + if (NodePtr CN = identifySplat(R, I)) return CN; @@ -1428,12 +1450,18 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation); + ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); - auto *I = cast(RootNode->Imag); - auto *ReplacementAnchor = R->comesBefore(I) ? I : R; + auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; + + Instruction *ReplacementAnchor; + if(I) + ReplacementAnchor = R->comesBefore(I) ? I : R; + else + ReplacementAnchor = R; + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1521,11 +1549,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t i = 0; i < OperationInstruction.size(); ++i) { if (Processed[i]) continue; + auto *Real = OperationInstruction[i]; for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - - auto *Real = OperationInstruction[i]; + auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; @@ -1557,6 +1585,25 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { break; } } + + // We want to check that we have 2 operands, but the function attributes + // being counted as operands bloats this value. 
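+    // (Note: for a call such as the llvm.experimental.vector.partial.reduce.add
+    // intrinsic, getNumOperands() also counts the called-function operand, so
+    // this is a conservative guard for the getOperand(0)/getOperand(1) accesses
+    // below rather than an exact arity check.)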
+ if(Real->getNumOperands() < 2) + continue; + + RealPHI = ReductionInfo[Real].first; + ImagPHI = nullptr; + PHIsFound = false; + auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1)); + if(Node && PHIsFound) { + LLVM_DEBUG(dbgs() << "Identified single reduction starting from instruction: " + << *Real << "/" << *ReductionInfo[Real].second << "\n"); + Processed[i] = true; + auto RootNode = prepareCompositeNode(ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); + RootNode->addOperand(Node); + RootToNode[Real] = RootNode; + submitCompositeNode(RootNode); + } } RealPHI = nullptr; @@ -1564,6 +1611,12 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } bool ComplexDeinterleavingGraph::checkNodes() { + + for (NodePtr N : CompositeNodes) { + if (!N->AreOperandsValid()) + return false; + } + // Collect all instructions from roots to leaves SmallPtrSet AllInstructions; SmallVector Worklist; @@ -1832,7 +1885,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, Instruction *Imag) { - if (Real != RealPHI || Imag != ImagPHI) + if (Real != RealPHI || (ImagPHI && Imag != ImagPHI)) return nullptr; PHIsFound = true; @@ -1970,13 +2023,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. + auto *OldPHI = cast(Node->Real); auto *VTy = cast(Node->Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt()); - OldToNewPHI[dyn_cast(Node->Real)] = NewPHI; + OldToNewPHI[OldPHI] = NewPHI; ReplacementNode = NewPHI; break; } + case ComplexDeinterleavingOperation::ReductionSingle: + ReplacementNode = replaceNode(Builder, Node->Operands[0]); + processReductionSingle(ReplacementNode, Node); + break; case ComplexDeinterleavingOperation::ReductionOperation: ReplacementNode = replaceNode(Builder, Node->Operands[0]); processReductionOperation(ReplacementNode, Node); @@ -2001,6 +2059,37 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, return ReplacementNode; } +void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacement, RawNodePtr Node) { + auto *Real = cast(Node->Real); + auto *OldPHI = ReductionInfo[Real].first; + auto *NewPHI = OldToNewPHI[OldPHI]; + auto *VTy = cast(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + Value *Init = OldPHI->getIncomingValueForBlock(Incoming); + + IRBuilder<> Builder(Incoming->getTerminator()); + + Value *NewInit = nullptr; + if(auto *C = dyn_cast(Init)) { + if(C->isZeroValue()) + NewInit = Constant::getNullValue(NewVTy); + } + + if (!NewInit) + NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, + {Init, Constant::getNullValue(VTy)}); + + NewPHI->addIncoming(NewInit, Incoming); + NewPHI->addIncoming(OperationReplacement, BackEdge); + + auto *FinalReduction = ReductionInfo[Real].second; + Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt()); + // TODO Ensure that the `AddReduce` here matches the original, found in `FinalReduction` + auto *AddReduce = Builder.CreateAddReduce(OperationReplacement); + FinalReduction->replaceAllUsesWith(AddReduce); +} + void ComplexDeinterleavingGraph::processReductionOperation( Value 
*OperationReplacement, RawNodePtr Node) { auto *Real = cast(Node->Real); @@ -2060,8 +2149,12 @@ void ComplexDeinterleavingGraph::replaceNodes() { auto *RootImag = cast(RootNode->Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); - DeadInstrRoots.push_back(cast(RootReal)); - DeadInstrRoots.push_back(cast(RootImag)); + DeadInstrRoots.push_back(RootReal); + DeadInstrRoots.push_back(RootImag); + } else if(RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) { + auto *RootInst = cast(RootNode->Real); + ReductionInfo[RootInst].first->removeIncomingValue(BackEdge); + DeadInstrRoots.push_back(ReductionInfo[RootInst].second); } else { assert(R && "Unable to find replacement for RootInstruction"); DeadInstrRoots.push_back(RootInstruction); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7448416c682ab..7c3c32643ed64 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29217,6 +29217,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast(InputA->getType()); + if (Accumulator == nullptr) + Accumulator = Constant::getNullValue(Ty); bool IsScalable = Ty->isScalableTy(); bool IsInt = Ty->getElementType()->isIntegerTy(); @@ -29228,6 +29230,7 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; + int AccStride = cast(Accumulator->getType())->getElementCount().getKnownMinValue() / 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); @@ -29237,25 +29240,23 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride)); Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; - if (Accumulator) { - LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0)); + Type *FullTy = Ty; + FullTy = Accumulator->getType(); + auto *HalfAccTy = VectorType::getHalfElementsVectorType(cast(Accumulator->getType())); + LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); UpperSplitAcc = - B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); - } + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); auto *LowerSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, + auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), LowerSplitInt, B.getInt64(0)); - return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride)); + return B.CreateInsertVector(FullTy, Result, UpperSplitInt, B.getInt64(AccStride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { - if (Accumulator == nullptr) - Accumulator = Constant::getNullValue(Ty); - if (IsScalable) { if (IsInt) return B.CreateIntrinsic( diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll new file mode 100644 index 0000000000000..6277f9a3842bb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: define i32 @cdotp( +; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] +; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) +; CHECK-NEXT: [[VEC_PHI:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 +; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] +; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] +; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] +; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader +for.body.preheader: ; preds = %entry + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + %0 = call i64 @llvm.vscale.i64() + %1 = mul i64 %0, 16 + %min.iters.check = icmp ult i64 %wide.trip.count, %1 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph +vector.ph: ; preds = %for.body.preheader + %2 = call i64 @llvm.vscale.i64() + %3 = mul i64 %2, 16 + %n.mod.vf = urem i64 %wide.trip.count, %3 + %n.vec = sub i64 %wide.trip.count, %n.mod.vf + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 16 + br label %vector.body +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi [ zeroinitializer, 
%vector.ph ], [ %partial.reduce.sub, %vector.body ] + %index.i = shl nuw nsw i64 %index, 1 + %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i + %a.load = load , ptr %a.ptr + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i + %b.load = load , ptr %b.ptr + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul nsw %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul nsw %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + %index.next = add nuw i64 %index, %5 + %22 = icmp eq i64 %index.next, %n.vec + br i1 %22, label %middle.block, label %vector.body +middle.block: ; preds = %vector.body + %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + %cmp.n = icmp eq i64 %wide.trip.count, %n.vec + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph +scalar.ph: ; preds = %middle.block, %for.body.preheader + %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] + %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ] + br label %for.body +for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body + %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ] + br label %for.cond.cleanup +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %res.0.lcssa +for.body: ; preds = %scalar.ph, %for.body + %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] + %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ] + %26 = shl nuw nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26 + %27 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %27 to i32 + %28 = or disjoint i64 %26, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28 + %29 = load i8, ptr %arrayidx4, align 1 + %conv5 = sext i8 %29 to i32 + %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26 + %30 = load i8, ptr %arrayidx9, align 1 + %conv10 = sext i8 %30 to i32 + %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28 + %31 = load i8, ptr %arrayidx14, align 1 + %conv15 = sext i8 %31 to i32 + %mul16 = mul nsw i32 %conv10, %conv + %add17 = add nsw i32 %mul16, %res.030 + %mul18 = mul nsw i32 %conv15, %conv5 + %sub = sub i32 %add17, %mul18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} From fb22e229f48e6008a798aeff44d0ec56ab157e49 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Fri, 18 Oct 2024 11:14:49 +0100 Subject: [PATCH 02/14] Apply clang-format --- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 46 ++++++++++--------- .../Target/AArch64/AArch64ISelLowering.cpp | 24 ++++++---- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git 
a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 08287a4d5ed02..3a5436714715b 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -201,9 +201,7 @@ struct ComplexDeinterleavingCompositeNode { } } - bool AreOperandsValid() { - return OperandsValid; - } + bool AreOperandsValid() { return OperandsValid; } }; class ComplexDeinterleavingGraph { @@ -918,11 +916,11 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - if(NodePtr CN = identifyPartialReduction(R, I)) + if (NodePtr CN = identifyPartialReduction(R, I)) return CN; bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); - if(!IsReduction && R->getType() != I->getType()) + if (!IsReduction && R->getType() != I->getType()) return nullptr; if (NodePtr CN = identifySplat(R, I)) @@ -1450,18 +1448,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle); + ComplexDeinterleavingOperation::ReductionOperation || + RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; Instruction *ReplacementAnchor; - if(I) + if (I) ReplacementAnchor = R->comesBefore(I) ? I : R; - else + else ReplacementAnchor = R; - + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1553,7 +1553,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - + auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; @@ -1588,18 +1588,20 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { // We want to check that we have 2 operands, but the function attributes // being counted as operands bloats this value. 
- if(Real->getNumOperands() < 2) + if (Real->getNumOperands() < 2) continue; RealPHI = ReductionInfo[Real].first; ImagPHI = nullptr; PHIsFound = false; auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1)); - if(Node && PHIsFound) { - LLVM_DEBUG(dbgs() << "Identified single reduction starting from instruction: " - << *Real << "/" << *ReductionInfo[Real].second << "\n"); + if (Node && PHIsFound) { + LLVM_DEBUG( + dbgs() << "Identified single reduction starting from instruction: " + << *Real << "/" << *ReductionInfo[Real].second << "\n"); Processed[i] = true; - auto RootNode = prepareCompositeNode(ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); + auto RootNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); RootNode->addOperand(Node); RootToNode[Real] = RootNode; submitCompositeNode(RootNode); @@ -2059,7 +2061,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, return ReplacementNode; } -void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacement, RawNodePtr Node) { +void ComplexDeinterleavingGraph::processReductionSingle( + Value *OperationReplacement, RawNodePtr Node) { auto *Real = cast(Node->Real); auto *OldPHI = ReductionInfo[Real].first; auto *NewPHI = OldToNewPHI[OldPHI]; @@ -2071,21 +2074,21 @@ void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacem IRBuilder<> Builder(Incoming->getTerminator()); Value *NewInit = nullptr; - if(auto *C = dyn_cast(Init)) { - if(C->isZeroValue()) + if (auto *C = dyn_cast(Init)) { + if (C->isZeroValue()) NewInit = Constant::getNullValue(NewVTy); } if (!NewInit) NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {Init, Constant::getNullValue(VTy)}); + {Init, Constant::getNullValue(VTy)}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); auto *FinalReduction = ReductionInfo[Real].second; Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt()); - // TODO Ensure that the `AddReduce` here matches the original, found in `FinalReduction` + auto *AddReduce = Builder.CreateAddReduce(OperationReplacement); FinalReduction->replaceAllUsesWith(AddReduce); } @@ -2151,7 +2154,8 @@ void ComplexDeinterleavingGraph::replaceNodes() { ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(RootReal); DeadInstrRoots.push_back(RootImag); - } else if(RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) { + } else if (RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle) { auto *RootInst = cast(RootNode->Real); ReductionInfo[RootInst].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(ReductionInfo[RootInst].second); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7c3c32643ed64..869e4e48427e8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29230,7 +29230,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; - int AccStride = cast(Accumulator->getType())->getElementCount().getKnownMinValue() / 2; + int AccStride = cast(Accumulator->getType()) + ->getElementCount() + .getKnownMinValue() / + 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB 
= B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); @@ -29241,19 +29244,22 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; Type *FullTy = Ty; - FullTy = Accumulator->getType(); - auto *HalfAccTy = VectorType::getHalfElementsVectorType(cast(Accumulator->getType())); - LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); - UpperSplitAcc = - B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); + FullTy = Accumulator->getType(); + auto *HalfAccTy = VectorType::getHalfElementsVectorType( + cast(Accumulator->getType())); + LowerSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); + UpperSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); auto *LowerSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), LowerSplitInt, - B.getInt64(0)); - return B.CreateInsertVector(FullTy, Result, UpperSplitInt, B.getInt64(AccStride)); + auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), + LowerSplitInt, B.getInt64(0)); + return B.CreateInsertVector(FullTy, Result, UpperSplitInt, + B.getInt64(AccStride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { From 42fba2826094bd5cd6ec2d8f82c8588b76ea3bb9 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Fri, 18 Oct 2024 13:14:33 +0100 Subject: [PATCH 03/14] Remove erroneously added function and call --- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 3a5436714715b..18ad74aa9bae1 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -335,7 +335,6 @@ class ComplexDeinterleavingGraph { /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); - NodePtr identifyPartialReduction(Value *R, Value *I); NodePtr identifyNode(Value *R, Value *I); NodePtr identifyNode(Value *R, Value *I, bool &FromCache); @@ -916,8 +915,6 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - if (NodePtr CN = identifyPartialReduction(R, I)) - return CN; bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); if (!IsReduction && R->getType() != I->getType()) From 918312c13b722a45a138cf126a567876efaf22a5 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 22 Oct 2024 14:32:57 +0100 Subject: [PATCH 04/14] Fix case where it fails to identify unrolled reductions Also removed prematurely-added test --- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 4 +- .../AArch64/complex-deinterleaving-cdot.ll | 170 ------------------ 2 files changed, 2 insertions(+), 172 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 18ad74aa9bae1..edad678e4d0c0 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -1546,11 +1546,10 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for 
(size_t i = 0; i < OperationInstruction.size(); ++i) { if (Processed[i]) continue; - auto *Real = OperationInstruction[i]; for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - + auto *Real = OperationInstruction[i]; auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; @@ -1583,6 +1582,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } } + auto *Real = OperationInstruction[i]; // We want to check that we have 2 operands, but the function attributes // being counted as operands bloats this value. if (Real->getNumOperands() < 2) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll deleted file mode 100644 index 6277f9a3842bb..0000000000000 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ /dev/null @@ -1,170 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { -; CHECK-LABEL: define i32 @cdotp( -; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] -; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] -; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) -; CHECK-NEXT: [[VEC_PHI:%.*]] = 
call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) -; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] -; CHECK: [[FOR_COND_CLEANUP]]: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 -; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 -; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 -; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] -; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] -; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] -; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - br i1 %cmp28.not, label %for.cond.cleanup, label 
%for.body.preheader -for.body.preheader: ; preds = %entry - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - %0 = call i64 @llvm.vscale.i64() - %1 = mul i64 %0, 16 - %min.iters.check = icmp ult i64 %wide.trip.count, %1 - br i1 %min.iters.check, label %scalar.ph, label %vector.ph -vector.ph: ; preds = %for.body.preheader - %2 = call i64 @llvm.vscale.i64() - %3 = mul i64 %2, 16 - %n.mod.vf = urem i64 %wide.trip.count, %3 - %n.vec = sub i64 %wide.trip.count, %n.mod.vf - %4 = call i64 @llvm.vscale.i64() - %5 = mul i64 %4, 16 - br label %vector.body -vector.body: - %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %vec.phi = phi [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ] - %index.i = shl nuw nsw i64 %index, 1 - %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i - %a.load = load , ptr %a.ptr - %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) - %a.real = extractvalue { , } %a.deinterleaved, 0 - %a.imag = extractvalue { , } %a.deinterleaved, 1 - %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i - %b.load = load , ptr %b.ptr - %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) - %b.real = extractvalue { , } %b.deinterleaved, 0 - %b.imag = extractvalue { , } %b.deinterleaved, 1 - %a.real.ext = sext %a.real to - %a.imag.ext = sext %a.imag to - %b.real.ext = sext %b.real to - %b.imag.ext = sext %b.imag to - %real.mul = mul nsw %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) - %imag.mul = mul nsw %b.imag.ext, %a.imag.ext - %imag.mul.neg = sub zeroinitializer, %imag.mul - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) - %index.next = add nuw i64 %index, %5 - %22 = icmp eq i64 %index.next, %n.vec - br i1 %22, label %middle.block, label %vector.body -middle.block: ; preds = %vector.body - %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) - %cmp.n = icmp eq i64 %wide.trip.count, %n.vec - br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph -scalar.ph: ; preds = %middle.block, %for.body.preheader - %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] - %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ] - br label %for.body -for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body - %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ] - br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %res.0.lcssa -for.body: ; preds = %scalar.ph, %for.body - %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] - %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ] - %26 = shl nuw nsw i64 %indvars.iv, 1 - %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26 - %27 = load i8, ptr %arrayidx, align 1 - %conv = sext i8 %27 to i32 - %28 = or disjoint i64 %26, 1 - %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28 - %29 = load i8, ptr %arrayidx4, align 1 - %conv5 = sext i8 %29 to i32 - %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26 - %30 = load i8, ptr %arrayidx9, align 1 - %conv10 = sext i8 %30 to i32 - %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28 - %31 = load i8, ptr %arrayidx14, align 1 - %conv15 = sext i8 %31 to i32 - %mul16 = 
mul nsw i32 %conv10, %conv - %add17 = add nsw i32 %mul16, %res.030 - %mul18 = mul nsw i32 %conv15, %conv5 - %sub = sub i32 %add17, %mul18 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body -} From e492fa4691ce844ba2d3b3a3e5ab05130b7b58e6 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 22 Oct 2024 14:45:40 +0100 Subject: [PATCH 05/14] Address formatting errors --- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index edad678e4d0c0..d92dcdcd943b1 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -915,7 +915,6 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); if (!IsReduction && R->getType() != I->getType()) return nullptr; From 838dff4e36055761ecb884cf47e441d49a610bb8 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 8 Oct 2024 14:53:49 +0100 Subject: [PATCH 06/14] Add support for complex dot product operations --- .../llvm/CodeGen/ComplexDeinterleavingPass.h | 1 + .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 164 +++++++++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 9 + .../AArch64/complex-deinterleaving-cdot.ll | 170 ++++++++++++++++++ 4 files changed, 344 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h index a3fa219772770..4383249658e60 100644 --- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -35,6 +35,7 @@ struct ComplexDeinterleavingPass enum class ComplexDeinterleavingOperation { CAdd, CMulPartial, + CDot, // The following 'operations' are used to represent internal states. Backends // are not expected to try and support these in any capacity. 
   Deinterleave,
   Splat,
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index d92dcdcd943b1..bfdae30722548 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -335,6 +335,8 @@ class ComplexDeinterleavingGraph {
   /// i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
   NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+  NodePtr identifyPartialReduction(Value *R, Value *I);
+  NodePtr identifyDotProduct(Value *Inst);
 
   NodePtr identifyNode(Value *R, Value *I);
   NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
@@ -900,6 +902,152 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
   return submitCompositeNode(Node);
 }
 
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
+  auto *Inst = cast<Instruction>(V);
+
+  if(!TL->isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation::CDot, Inst->getType())) {
+    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving operation CDot with the type " << *Inst->getType() << "\n");
+    return nullptr;
+  }
+
+  auto *RealUser = cast<Instruction>(*Inst->user_begin());
+
+  NodePtr CN = prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
+
+  NodePtr ANode;
+
+  const Intrinsic::ID PartialReduceInt = Intrinsic::experimental_vector_partial_reduce_add;
+
+  Value *AReal = nullptr;
+  Value *AImag = nullptr;
+  Value *BReal = nullptr;
+  Value *BImag = nullptr;
+  Value *Phi = nullptr;
+
+  auto UnwrapCast = [](Value *V) -> Value* {
+    if(auto *CI = dyn_cast<CastInst>(V))
+      return CI->getOperand(0);
+    return V;
+  };
+
+  auto PatternRot0 =
+    m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+        m_Value(Phi),
+        m_Mul(m_Value(BReal), m_Value(AReal))),
+      m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
+
+  auto PatternRot270 =
+    m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+        m_Value(Phi),
+        m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
+      m_Mul(m_Value(BImag), m_Value(AReal)));
+
+  if(match(Inst, PatternRot0)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
+  }else if(match(Inst, PatternRot270)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
+  }else {
+    Value *A0, *A1;
+    // The rotations 90 and 180 share the same operation pattern, so inspect the
+    // order of the operands, identifying where the real and imaginary components
+    // of A go, to discern between the aforementioned rotations.
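+    // For reference, with a = ar + i*ai, b = br + i*bi and acc the running
+    // partial sum, the four accumulations matched in this function work out
+    // to (rotation labels as assigned here):
+    //   rot0:   acc += ar*br - ai*bi     rot90:  acc += ai*br + ar*bi
+    //   rot180: acc += ar*br + ai*bi     rot270: acc += ar*bi - ai*br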
+    auto PatternRot90Rot180 =
+      m_Intrinsic<PartialReduceInt>(
+        m_Intrinsic<PartialReduceInt>(
+          m_Value(Phi),
+          m_Mul(m_Value(BReal), m_Value(A0))
+        ),
+        m_Mul(m_Value(BImag), m_Value(A1)));
+
+    if(!match(Inst, PatternRot90Rot180))
+      return nullptr;
+
+    A0 = UnwrapCast(A0);
+    A1 = UnwrapCast(A1);
+
+    // Test if A0 is real/A1 is imag
+    ANode = identifyNode(A0, A1);
+    if(!ANode) {
+      // Test if A0 is imag/A1 is real
+      ANode = identifyNode(A1, A0);
+      // Unable to identify operand components, thus unable to identify rotation
+      if(!ANode)
+        return nullptr;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
+      AReal = A1;
+      AImag = A0;
+    } else {
+      AReal = A0;
+      AImag = A1;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_180;
+    }
+  }
+
+  AReal = UnwrapCast(AReal);
+  AImag = UnwrapCast(AImag);
+  BReal = UnwrapCast(BReal);
+  BImag = UnwrapCast(BImag);
+
+  bool WasANodeFromCache = false;
+  NodePtr Node = identifyNode(AReal, AImag, WasANodeFromCache);
+
+  // In the case that a node was identified to figure out the rotation, ensure that trying to
+  // identify a node with AReal and AImag post-unwrap results in the same node
+  if(Node && ANode && !WasANodeFromCache) {
+    LLVM_DEBUG(dbgs() << "Identified node is different from previously identified node. Unable to confidently generate a complex operation node\n");
+    return nullptr;
+  }
+
+  CN->addOperand(Node);
+  CN->addOperand(identifyNode(BReal, BImag));
+  CN->addOperand(identifyNode(Phi, RealUser));
+
+  return submitCompositeNode(CN);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
+  if (!I->hasOneUser())
+    return nullptr;
+
+  VectorType *RealTy = dyn_cast<VectorType>(R->getType());
+  if(!RealTy)
+    return nullptr;
+  VectorType *ImagTy = dyn_cast<VectorType>(I->getType());
+  if(!ImagTy)
+    return nullptr;
+
+  if(RealTy->isScalableTy() != ImagTy->isScalableTy())
+    return nullptr;
+  if(RealTy->getElementType() != ImagTy->getElementType())
+    return nullptr;
+
+  // `I` is known to only have one user, so iterate over the Phi (R) users to find the common user between R and I
+  auto *CommonUser = *I->user_begin();
+  bool CommonUserFound = false;
+  for (auto *User : R->users()) {
+    if (User == CommonUser) {
+      CommonUserFound = true;
+      break;
+    }
+  }
+
+  if (!CommonUserFound)
+    return nullptr;
+
+  auto *IInst = dyn_cast<IntrinsicInst>(CommonUser);
+  if (!IInst || IInst->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
+    return nullptr;
+
+  if(NodePtr CN = identifyDotProduct(IInst))
+    return CN;
+
+  return nullptr;
+}
+
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
   bool _;
@@ -915,6 +1063,9 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
     return It->second;
   }
 
+  if(NodePtr CN = identifyPartialReduction(R, I))
+    return CN;
+
   bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
   if (!IsReduction && R->getType() != I->getType())
     return nullptr;
@@ -1535,6 +1686,7 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
 }
 
 void ComplexDeinterleavingGraph::identifyReductionNodes() {
+  dbgs() << "identifyReductionNodes\n";
   SmallVector<bool> Processed(ReductionInfo.size(), false);
   SmallVector<Instruction *> OperationInstruction;
   for (auto &P : ReductionInfo)
@@ -1590,6 +1742,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
     RealPHI = ReductionInfo[Real].first;
     ImagPHI = nullptr;
     PHIsFound = false;
+    dbgs() << "identifyNode from Phi " << *RealPHI << " / " << *Real << "\n";
     auto Node = identifyNode(Real->getOperand(0),
Real->getOperand(1)); if (Node && PHIsFound) { LLVM_DEBUG( @@ -1978,6 +2131,17 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, Value *ReplacementNode; switch (Node->Operation) { + case ComplexDeinterleavingOperation::CDot: { + Value *Input0 = ReplaceOperandIfExist(Node, 0); + Value *Input1 = ReplaceOperandIfExist(Node, 1); + Value *Accumulator = ReplaceOperandIfExist(Node, 2); + assert(!Input1 || (Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type")); + ReplacementNode = TL->createComplexDeinterleavingIR( + Builder, Node->Operation, Node->Rotation, Input0, Input1, + Accumulator); + break; + } case ComplexDeinterleavingOperation::CAdd: case ComplexDeinterleavingOperation::CMulPartial: case ComplexDeinterleavingOperation::Symmetric: { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 869e4e48427e8..a2dcf7f851b55 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29205,6 +29205,9 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); + + if (Operation == ComplexDeinterleavingOperation::CDot) + return ScalarWidth == 32 || ScalarWidth == 64; return 8 <= ScalarWidth && ScalarWidth <= 64; } @@ -29314,6 +29317,12 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } + if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && IsScalable) { + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_cdot, Accumulator->getType(), + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + } + return nullptr; } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll new file mode 100644 index 0000000000000..6277f9a3842bb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: define i32 @cdotp( +; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; 
CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] +; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] +; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) +; CHECK-NEXT: [[VEC_PHI:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 +; CHECK-NEXT: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 +; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] +; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] +; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] +; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader +for.body.preheader: ; preds = %entry + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + %0 = call i64 @llvm.vscale.i64() + %1 = mul i64 %0, 16 + %min.iters.check = icmp ult i64 %wide.trip.count, %1 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph +vector.ph: ; preds = %for.body.preheader + %2 = call i64 @llvm.vscale.i64() + %3 = mul i64 %2, 16 + %n.mod.vf = urem i64 %wide.trip.count, %3 + %n.vec = sub i64 %wide.trip.count, %n.mod.vf + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 16 + br label %vector.body +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ] + %index.i = shl nuw nsw i64 %index, 1 + %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i + %a.load = load , ptr %a.ptr + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i + %b.load = load , ptr %b.ptr + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul nsw %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul nsw %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + %index.next = add nuw i64 %index, %5 + %22 = icmp eq i64 %index.next, %n.vec + br i1 %22, label %middle.block, label %vector.body +middle.block: ; preds = %vector.body + %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + %cmp.n = icmp eq i64 %wide.trip.count, %n.vec + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph +scalar.ph: ; preds = %middle.block, %for.body.preheader + %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] + 
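+  ; %bc.merge.rdx seeds the scalar remainder loop with the vector loop's reduced sum (%25), or with 0 when the vector path was skipped.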
%bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ]
+  br label %for.body
+for.cond.cleanup.loopexit:                        ; preds = %middle.block, %for.body
+  %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ]
+  br label %for.cond.cleanup
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %res.0.lcssa
+for.body:                                         ; preds = %scalar.ph, %for.body
+  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+  %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ]
+  %26 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26
+  %27 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %27 to i32
+  %28 = or disjoint i64 %26, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28
+  %29 = load i8, ptr %arrayidx4, align 1
+  %conv5 = sext i8 %29 to i32
+  %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26
+  %30 = load i8, ptr %arrayidx9, align 1
+  %conv10 = sext i8 %30 to i32
+  %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28
+  %31 = load i8, ptr %arrayidx14, align 1
+  %conv15 = sext i8 %31 to i32
+  %mul16 = mul nsw i32 %conv10, %conv
+  %add17 = add nsw i32 %mul16, %res.030
+  %mul18 = mul nsw i32 %conv15, %conv5
+  %sub = sub i32 %add17, %mul18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

From b2410688146531936db5f58ed2f0ebf78bf8387a Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Mon, 28 Oct 2024 15:29:18 +0000
Subject: [PATCH 07/14] Apply clang-format

---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 107 +++++++++---------
 .../Target/AArch64/AArch64ISelLowering.cpp    |   7 +-
 2 files changed, 59 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index bfdae30722548..098d7ead3456e 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -906,18 +906,23 @@ ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   auto *Inst = cast<Instruction>(V);
 
-  if(!TL->isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation::CDot, Inst->getType())) {
-    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving operation CDot with the type " << *Inst->getType() << "\n");
+  if (!TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CDot, Inst->getType())) {
+    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
+                         "operation CDot with the type "
+                      << *Inst->getType() << "\n");
     return nullptr;
   }
 
   auto *RealUser = cast<Instruction>(*Inst->user_begin());
 
-  NodePtr CN = prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
+  NodePtr CN =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
 
   NodePtr ANode;
 
-  const Intrinsic::ID PartialReduceInt = Intrinsic::experimental_vector_partial_reduce_add;
+  const Intrinsic::ID PartialReduceInt =
+      Intrinsic::experimental_vector_partial_reduce_add;
 
   Value *AReal = nullptr;
   Value *AImag = nullptr;
@@ -925,56 +930,49 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   Value *BImag = nullptr;
   Value *Phi = nullptr;
 
-  auto UnwrapCast = [](Value *V) -> Value* {
-    if(auto *CI = dyn_cast<CastInst>(V))
+  auto UnwrapCast = [](Value *V) -> Value * {
+    if (auto *CI = dyn_cast<CastInst>(V))
       return CI->getOperand(0);
     return V;
   };
-
-  auto PatternRot0 =
-      m_Intrinsic<PartialReduceInt>(
-          m_Intrinsic<PartialReduceInt>(
-              m_Value(Phi),
-              m_Mul(m_Value(BReal), m_Value(AReal))),
-          m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
-
-  auto PatternRot270 =
-      m_Intrinsic<PartialReduceInt>(
-          m_Intrinsic<PartialReduceInt>(
-              m_Value(Phi),
-              m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
-          m_Mul(m_Value(BImag), m_Value(AReal)));
-
-  if(match(Inst, PatternRot0)) {
+
+  auto PatternRot0 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                    m_Mul(m_Value(BReal), m_Value(AReal))),
+      m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
+
+  auto PatternRot270 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+          m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
+      m_Mul(m_Value(BImag), m_Value(AReal)));
+
+  if (match(Inst, PatternRot0)) {
     CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
-  }else if(match(Inst, PatternRot270)) {
+  } else if (match(Inst, PatternRot270)) {
     CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
-  }else {
+  } else {
     Value *A0, *A1;
     // The rotations 90 and 180 share the same operation pattern, so inspect the
-    // order of the operands, identifying where the real and imaginary components
-    // of A go, to discern between the aforementioned rotations.
-    auto PatternRot90Rot180 =
-        m_Intrinsic<PartialReduceInt>(
-            m_Intrinsic<PartialReduceInt>(
-                m_Value(Phi),
-                m_Mul(m_Value(BReal), m_Value(A0))
-            ),
-            m_Mul(m_Value(BImag), m_Value(A1)));
-
-    if(!match(Inst, PatternRot90Rot180))
+    // order of the operands, identifying where the real and imaginary
+    // components of A go, to discern between the aforementioned rotations.
+    auto PatternRot90Rot180 = m_Intrinsic<PartialReduceInt>(
+        m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                      m_Mul(m_Value(BReal), m_Value(A0))),
+        m_Mul(m_Value(BImag), m_Value(A1)));
+
+    if (!match(Inst, PatternRot90Rot180))
       return nullptr;
-
+
     A0 = UnwrapCast(A0);
     A1 = UnwrapCast(A1);
 
     // Test if A0 is real/A1 is imag
     ANode = identifyNode(A0, A1);
-    if(!ANode) {
+    if (!ANode) {
       // Test if A0 is imag/A1 is real
       ANode = identifyNode(A1, A0);
       // Unable to identify operand components, thus unable to identify rotation
-      if(!ANode)
+      if (!ANode)
        return nullptr;
       CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
       AReal = A1;
@@ -994,10 +992,14 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
 
   bool WasANodeFromCache = false;
   NodePtr Node = identifyNode(AReal, AImag, WasANodeFromCache);
 
-  // In the case that a node was identified to figure out the rotation, ensure that trying to
-  // identify a node with AReal and AImag post-unwrap results in the same node
-  if(Node && ANode && !WasANodeFromCache) {
-    LLVM_DEBUG(dbgs() << "Identified node is different from previously identified node. Unable to confidently generate a complex operation node\n");
+  // In the case that a node was identified to figure out the rotation, ensure
+  // that trying to identify a node with AReal and AImag post-unwrap results in
+  // the same node
+  if (Node && ANode && !WasANodeFromCache) {
+    LLVM_DEBUG(
+        dbgs()
+        << "Identified node is different from previously identified node. "
+           "Unable to confidently generate a complex operation node\n");
     return nullptr;
   }
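For reference, the accumulation each matched rotation performs per complex element pair (a sketch derived from the four matchers above and the rot0/90/180/270 tests later in this series; re/im denote the deinterleaved real and imaginary halves, and acc the partial-reduction accumulator):

  rot0:   acc += b.re * a.re - b.im * a.im
  rot90:  acc += b.re * a.im + b.im * a.re
  rot180: acc += b.re * a.re + b.im * a.im
  rot270: acc += b.im * a.re - b.re * a.im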
" + "Unable to confidently generate a complex operation node\n"); return nullptr; } @@ -1014,18 +1016,19 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { return nullptr; VectorType *RealTy = dyn_cast(R->getType()); - if(!RealTy) + if (!RealTy) return nullptr; VectorType *ImagTy = dyn_cast(I->getType()); - if(!ImagTy) + if (!ImagTy) return nullptr; - if(RealTy->isScalableTy() != ImagTy->isScalableTy()) + if (RealTy->isScalableTy() != ImagTy->isScalableTy()) return nullptr; - if(RealTy->getElementType() != ImagTy->getElementType()) + if (RealTy->getElementType() != ImagTy->getElementType()) return nullptr; - // `I` is known to only have one user, so iterate over the Phi (R) users to find the common user between R and I + // `I` is known to only have one user, so iterate over the Phi (R) users to + // find the common user between R and I auto *CommonUser = *I->user_begin(); bool CommonUserFound = false; for (auto *User : R->users()) { @@ -1039,10 +1042,11 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { return nullptr; auto *IInst = dyn_cast(CommonUser); - if (!IInst || IInst->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) + if (!IInst || IInst->getIntrinsicID() != + Intrinsic::experimental_vector_partial_reduce_add) return nullptr; - if(NodePtr CN = identifyDotProduct(IInst)) + if (NodePtr CN = identifyDotProduct(IInst)) return CN; return nullptr; @@ -1063,7 +1067,7 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) { return It->second; } - if(NodePtr CN = identifyPartialReduction(R, I)) + if (NodePtr CN = identifyPartialReduction(R, I)) return CN; bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); @@ -2138,8 +2142,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, assert(!Input1 || (Input0->getType() == Input1->getType() && "Node inputs need to be of the same type")); ReplacementNode = TL->createComplexDeinterleavingIR( - Builder, Node->Operation, Node->Rotation, Input0, Input1, - Accumulator); + Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); break; } case ComplexDeinterleavingOperation::CAdd: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a2dcf7f851b55..c89beb7233af6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29317,10 +29317,11 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } - if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && IsScalable) { + if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && + IsScalable) { return B.CreateIntrinsic( - Intrinsic::aarch64_sve_cdot, Accumulator->getType(), - {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + Intrinsic::aarch64_sve_cdot, Accumulator->getType(), + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); } return nullptr; From 1bf2f2ed7f5c5cfa9d851c7b848217a63a6ccbc7 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 29 Oct 2024 16:07:52 +0000 Subject: [PATCH 08/14] Change identifyNode parameter to use bool* instead of bool& Remove debug comments --- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 
index 098d7ead3456e..11f5813c33b0f 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -338,8 +338,7 @@ class ComplexDeinterleavingGraph {
   NodePtr identifyPartialReduction(Value *R, Value *I);
   NodePtr identifyDotProduct(Value *Inst);
 
-  NodePtr identifyNode(Value *R, Value *I);
-  NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
+  NodePtr identifyNode(Value *R, Value *I, bool *FromCache = nullptr);
 
   /// Determine if a sum of complex numbers can be formed from \p RealAddends
   /// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
@@ -990,7 +989,7 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   BImag = UnwrapCast(BImag);
 
   bool WasANodeFromCache = false;
-  NodePtr Node = identifyNode(AReal, AImag, WasANodeFromCache);
+  NodePtr Node = identifyNode(AReal, AImag, &WasANodeFromCache);
 
   // In the case that a node was identified to figure out the rotation, ensure
   // that trying to identify a node with AReal and AImag post-unwrap results in
@@ -1053,17 +1052,12 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
 }
 
 ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
-  bool _;
-  return identifyNode(R, I, _);
-}
-
-ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool *FromCache) {
   auto It = CachedResult.find({R, I});
   if (It != CachedResult.end()) {
     LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
-    FromCache = true;
+    if (FromCache != nullptr)
+      *FromCache = true;
     return It->second;
   }
 
@@ -1690,7 +1684,6 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
 }
 
 void ComplexDeinterleavingGraph::identifyReductionNodes() {
-  dbgs() << "identifyReductionNodes\n";
   SmallVector<bool> Processed(ReductionInfo.size(), false);
   SmallVector<Instruction *> OperationInstruction;
   for (auto &P : ReductionInfo)
@@ -1746,7 +1739,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
     RealPHI = ReductionInfo[Real].first;
     ImagPHI = nullptr;
     PHIsFound = false;
-    dbgs() << "identifyNode from Phi " << *RealPHI << " / " << *Real << "\n";
     auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
     if (Node && PHIsFound) {
       LLVM_DEBUG(

From b19c99f23ba189ae8fc54d4c23ebf9d64c38a404 Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 29 Oct 2024 16:38:11 +0000
Subject: [PATCH 09/14] Simplify complex-deinterleaving-cdot.ll

---
 .../AArch64/complex-deinterleaving-cdot.ll    | 190 ++++--------------
 1 file changed, 39 insertions(+), 151 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
index 6277f9a3842bb..6e2a2044db81f 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -4,167 +4,55 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
-define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+define i32 @cdotp() {
 ; CHECK-LABEL: define i32 @cdotp(
-; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT:    br i1
[[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]] -; CHECK-NEXT: [[A_LOAD:%.*]] = load , ptr [[A_PTR]], align 32 -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]] -; CHECK-NEXT: [[B_LOAD:%.*]] = load , ptr [[B_PTR]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A_LOAD]], i64 16) -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B_LOAD]], i64 16) -; CHECK-NEXT: [[VEC_PHI:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[VEC_PHI]], [[TMP6]], [[TMP7]], i32 0) -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP13]], [[TMP8]], [[TMP9]], i32 0) -; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP20]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP22]], [[TMP21]], i64 4) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.interleave2.nxv32i8( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP2]], [[TMP3]], i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP7]], [[TMP4]], [[TMP5]], i32 0) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP11]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP10]], [[TMP9]], i64 4) +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP20]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] -; CHECK: [[FOR_COND_CLEANUP]]: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 -; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1 -; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32 -; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]] -; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]] -; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]] -; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP11]]) +; CHECK-NEXT: ret i32 [[TMP12]] ; entry: - %cmp28.not = icmp ult i32 %N, 2 - br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader -for.body.preheader: ; preds = %entry - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to 
i64 - %0 = call i64 @llvm.vscale.i64() - %1 = mul i64 %0, 16 - %min.iters.check = icmp ult i64 %wide.trip.count, %1 - br i1 %min.iters.check, label %scalar.ph, label %vector.ph -vector.ph: ; preds = %for.body.preheader - %2 = call i64 @llvm.vscale.i64() - %3 = mul i64 %2, 16 - %n.mod.vf = urem i64 %wide.trip.count, %3 - %n.vec = sub i64 %wide.trip.count, %n.mod.vf - %4 = call i64 @llvm.vscale.i64() - %5 = mul i64 %4, 16 br label %vector.body -vector.body: - %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %vec.phi = phi [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ] - %index.i = shl nuw nsw i64 %index, 1 - %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i - %a.load = load , ptr %a.ptr - %a.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %a.load) - %a.real = extractvalue { , } %a.deinterleaved, 0 - %a.imag = extractvalue { , } %a.deinterleaved, 1 - %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i - %b.load = load , ptr %b.ptr - %b.deinterleaved = call { , } @llvm.vector.deinterleave2.nxv32i8( %b.load) - %b.real = extractvalue { , } %b.deinterleaved, 0 - %b.imag = extractvalue { , } %b.deinterleaved, 1 - %a.real.ext = sext %a.real to - %a.imag.ext = sext %a.imag to - %b.real.ext = sext %b.real to - %b.imag.ext = sext %b.imag to - %real.mul = mul nsw %b.real.ext, %a.real.ext + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %a.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %b.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %b.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %real.mul = mul %b.real.ext, %a.real.ext %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) - %imag.mul = mul nsw %b.imag.ext, %a.imag.ext + %imag.mul = mul %b.imag.ext, %a.imag.ext %imag.mul.neg = sub zeroinitializer, %imag.mul %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) - %index.next = add nuw i64 %index, %5 - %22 = icmp eq i64 %index.next, %n.vec - br i1 %22, label %middle.block, label %vector.body + br i1 true, label %middle.block, label %vector.body + middle.block: ; preds = %vector.body - %25 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) - %cmp.n = icmp eq i64 %wide.trip.count, %n.vec - br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph -scalar.ph: ; preds = %middle.block, %for.body.preheader - %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ] - %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ] - br label %for.body -for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body - %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ] - br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %res.0.lcssa -for.body: ; preds = %scalar.ph, %for.body - %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] - %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ] - %26 = shl nuw nsw i64 %indvars.iv, 1 - %arrayidx = 
getelementptr inbounds i8, ptr %a, i64 %26 - %27 = load i8, ptr %arrayidx, align 1 - %conv = sext i8 %27 to i32 - %28 = or disjoint i64 %26, 1 - %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28 - %29 = load i8, ptr %arrayidx4, align 1 - %conv5 = sext i8 %29 to i32 - %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26 - %30 = load i8, ptr %arrayidx9, align 1 - %conv10 = sext i8 %30 to i32 - %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28 - %31 = load i8, ptr %arrayidx14, align 1 - %conv15 = sext i8 %31 to i32 - %mul16 = mul nsw i32 %conv10, %conv - %add17 = add nsw i32 %mul16, %res.030 - %mul18 = mul nsw i32 %conv15, %conv5 - %sub = sub i32 %add17, %mul18 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 } + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.nxv4i32() #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From fc68abee7a1eaa79e8a57808cbb820ba9cbbccb3 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Fri, 8 Nov 2024 10:41:52 +0000 Subject: [PATCH 10/14] Add additional test cases for cdot --- .../AArch64/complex-deinterleaving-cdot.ll | 813 +++++++++++++++++- 1 file changed, 786 insertions(+), 27 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index 6e2a2044db81f..72722260a2f78 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -1,41 +1,100 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE +; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" -define i32 @cdotp() { -; CHECK-LABEL: define i32 @cdotp( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.interleave2.nxv32i8( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[TMP1]], i64 16) -; 
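+; Note: the CHECK-SVE and CHECK-NOSVE runs cover targets without SVE2's cdot instruction; for those, the pass is expected to leave the deinterleave and partial-reduce IR unchanged rather than form @llvm.aarch64.sve.cdot.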
CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP0]], i64 4) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP2]], [[TMP3]], i32 0) -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP7]], [[TMP4]], [[TMP5]], i32 0) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP8]], i64 0) -; CHECK-NEXT: [[TMP11]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP10]], [[TMP9]], i64 4) -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP11]]) -; CHECK-NEXT: ret i32 [[TMP12]] +define i32 @cdotp_i8_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], 
[[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] ; entry: br label %vector.body vector.body: ; preds = %vector.body, %entry %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] - %a.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to - %a.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to - %b.real.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to - %b.imag.ext = sext shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) to + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + 
%a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext @@ -48,11 +107,711 @@ middle.block: ; preds = %vector.body ret i32 %0 } +define i32 @cdotp_i8_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] 
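+; Note: the rotation immediate on @llvm.aarch64.sve.cdot (90 here) is computed by AArch64TargetLowering::createComplexDeinterleavingIR as B.getInt32((int)Rotation * 90).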
+; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = 
extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = 
call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: 
[[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i64 @cdotp_i16_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br 
label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; 
CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] 
= extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced 
= call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], 
[[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ 
[[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } 
@llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) #0 +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) #0 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.vector.reduce.add.nxv4i32() #1 +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.nxv2i64() #1 attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From 97eff4d20c933df2afe5b7c533afc13b69f4a9ef Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Mon, 25 Nov 2024 
10:54:41 +0000 Subject: [PATCH 11/14] Add negative test case for cdot --- .../AArch64/complex-deinterleaving-cdot.ll | 123 ++++++++++++++++-- 1 file changed, 113 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index 72722260a2f78..c9865558d94c2 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -803,15 +803,118 @@ middle.block: ; preds = %vector.body } -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) #0 -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) #0 +define i32 @not_cdotp( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @not_cdotp( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @not_cdotp( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: 
[[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @not_cdotp( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = 
extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                    ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #1
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>) #1
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
-attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)

From 646ffe3b42e96a428b372ddbde0b2fe5001b968b Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 10 Dec 2024 15:03:17 +0000
Subject: [PATCH 12/14] Address comments, and refactor where certain checks
 are performed

---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp |  73 +++---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +
 .../AArch64/complex-deinterleaving-cdot.ll    | 216 ++++++++++++++++++
 3 files changed, 255 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 95a9768da27df..b9fbce6955e7a 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,8 +108,15 @@ static bool isNeg(Value *V);
 static Value *getNegOperand(Value *V);

 namespace {
+  template <typename T, typename IterT>
+  std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
+    auto Common = llvm::find_if(A, [B](T I){return llvm::is_contained(B, I);});
+    if (Common != A.end())
+      return std::make_optional(*Common);
+    return std::nullopt;
+  }

-class ComplexDeinterleavingLegacyPass : public FunctionPass {
+  class ComplexDeinterleavingLegacyPass : public FunctionPass {
 public:
   static char ID;

@@ -337,7 +344,7 @@ class ComplexDeinterleavingGraph {
   NodePtr identifyPartialReduction(Value *R, Value *I);
   NodePtr identifyDotProduct(Value *Inst);

-  NodePtr identifyNode(Value *R, Value *I, bool *FromCache = nullptr);
+  NodePtr identifyNode(Value *R, Value *I);

   /// Determine if a sum of complex numbers can be formed from \p RealAddends
   /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
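For review context: the findCommonBetweenCollections helper introduced above returns the first element of A that is also contained in B, wrapped in std::optional, or std::nullopt when the two ranges share nothing. A minimal standalone sketch of the same behaviour (using the standard library in place of llvm::find_if and llvm::is_contained; the driver below is hypothetical and not part of this patch):

    #include <algorithm>
    #include <optional>
    #include <vector>

    // Standalone re-creation of the helper: yields the first element of A
    // that also appears in B, otherwise std::nullopt. T must be spelled out
    // explicitly because it cannot be deduced from the arguments.
    template <typename T, typename RangeT>
    std::optional<T> findCommonBetweenCollections(const RangeT &A,
                                                  const RangeT &B) {
      auto Common = std::find_if(A.begin(), A.end(), [&B](const T &V) {
        return std::find(B.begin(), B.end(), V) != B.end();
      });
      if (Common != A.end())
        return std::make_optional(*Common);
      return std::nullopt;
    }

    int main() {
      // Mirrors the pass's use: R's users are scanned for a value that is
      // also among I's users. Plain ints stand in for the two user lists.
      std::vector<int> RUsers{1, 7, 3}, IUsers{3, 7};
      // Returns 7: the first element of RUsers that IUsers also contains.
      return findCommonBetweenCollections<int>(RUsers, IUsers).value_or(-1);
    }
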
@@ -902,16 +909,16 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
-  auto *Inst = cast<Instruction>(V);
   if (!TL->isComplexDeinterleavingOperationSupported(
-          ComplexDeinterleavingOperation::CDot, Inst->getType())) {
+          ComplexDeinterleavingOperation::CDot, V->getType())) {
     LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
                          "operation CDot with the type "
-                      << *Inst->getType() << "\n");
+                      << *V->getType() << "\n");
     return nullptr;
   }

+  auto *Inst = cast<Instruction>(V);
   auto *RealUser = cast<Instruction>(*Inst->user_begin());

   NodePtr CN =
@@ -987,13 +994,26 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
   BReal = UnwrapCast(BReal);
   BImag = UnwrapCast(BImag);

-  bool WasANodeFromCache = false;
-  NodePtr Node = identifyNode(AReal, AImag, &WasANodeFromCache);
+  VectorType *VTy = cast<VectorType>(V->getType());
+  Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2);
+  if (AReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (AImag->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BImag->getType() != ExpectedOperandTy)
+    return nullptr;
+
+  if (Phi->getType() != VTy && RealUser->getType() != VTy)
+    return nullptr;
+
+  NodePtr Node = identifyNode(AReal, AImag);

   // In the case that a node was identified to figure out the rotation, ensure
   // that trying to identify a node with AReal and AImag post-unwrap results in
   // the same node
-  if (Node && ANode && !WasANodeFromCache) {
+  if (ANode && Node != ANode) {
     LLVM_DEBUG(
         dbgs()
         << "Identified node is different from previously identified node. "
@@ -1010,38 +1030,17 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {

 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
-  if (!I->hasOneUser())
+  // Partial reductions don't support non-vector types, so check these first
+  if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
     return nullptr;

-  VectorType *RealTy = dyn_cast<VectorType>(R->getType());
-  if (!RealTy)
-    return nullptr;
-  VectorType *ImagTy = dyn_cast<VectorType>(I->getType());
-  if (!ImagTy)
-    return nullptr;
-
-  if (RealTy->isScalableTy() != ImagTy->isScalableTy())
-    return nullptr;
-  if (RealTy->getElementType() != ImagTy->getElementType())
-    return nullptr;
-
-  // `I` is known to only have one user, so iterate over the Phi (R) users to
-  // find the common user between R and I
-  auto *CommonUser = *I->user_begin();
-  bool CommonUserFound = false;
-  for (auto *User : R->users()) {
-    if (User == CommonUser) {
-      CommonUserFound = true;
-      break;
-    }
-  }
-
-  if (!CommonUserFound)
+  auto CommonUser = findCommonBetweenCollections<Value *>(R->users(), I->users());
+  if (!CommonUser)
     return nullptr;

-  auto *IInst = dyn_cast<IntrinsicInst>(CommonUser);
+  auto *IInst = dyn_cast<IntrinsicInst>(*CommonUser);
   if (!IInst || IInst->getIntrinsicID() !=
-                    Intrinsic::experimental_vector_partial_reduce_add)
+                Intrinsic::experimental_vector_partial_reduce_add)
     return nullptr;

   if (NodePtr CN = identifyDotProduct(IInst))
@@ -1051,12 +1050,10 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {

 ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool *FromCache) {
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
   auto It = CachedResult.find({R, I});
   if (It != CachedResult.end()) {
     LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
-    if (FromCache != nullptr)
-      *FromCache = true;
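// Illustrative note, not part of the patch: the ExpectedOperandTy guard
// above relies on VectorType::getSubdividedVectorType, which per subdivision
// halves the element width and doubles the element count, so two
// subdivisions of the cdot accumulator type reproduce the deinterleaved
// operand type. Assuming the in-tree DerivedTypes API:
//
//   LLVMContext Ctx;
//   // <vscale x 4 x i32>, the accumulator type of a valid i8 cdot loop.
//   auto *AccTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
//   // i32 -> i16 -> i8 and 4 -> 8 -> 16 lanes: <vscale x 16 x i8>.
//   Type *Expected = VectorType::getSubdividedVectorType(AccTy, 2);
//
// A <vscale x 8 x i16> accumulator instead subdivides to <vscale x 32 x i4>,
// which can never equal <vscale x 16 x i8> operands, so patterns like the
// invalid_type test added below fall through to the nullptr bail-outs.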
return It->second; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c66ff590f2aa9..423395a994587 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29414,6 +29414,10 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( return 8 <= ScalarWidth && ScalarWidth <= 64; } + // CDot is not supported outside of scalable/sve scopes + if (Operation == ComplexDeinterleavingOperation::CDot) + return false; + return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index c9865558d94c2..b02e5972f54e5 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -913,8 +913,224 @@ middle.block: ; preds = %vector.body ret i32 %0 } +define i16 @invalid_type( %a, %b) { +; CHECK-SVE2-LABEL: define i16 @invalid_type( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i16 [[TMP0]] +; +; CHECK-SVE-LABEL: define i16 @invalid_type( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } 
[[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i16 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i16 @invalid_type( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i16 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 
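; Illustrative note, not part of the original test: with a <vscale x 8 x i16>
; accumulator the partial reductions below narrow <vscale x 16 x i32>
; products to i16 elements, so the accumulator element is only twice as wide
; as the deinterleaved i8 inputs. The cdot rewrite needs a four-times-wider
; accumulator (<vscale x 4 x i32> for i8 inputs), so the operand-type guard
; in identifyDotProduct rejects this loop and the CHECK lines above show it
; left unchanged.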
+ %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i16 @llvm.vector.reduce.add.nxv8i16( %partial.reduce.sub) + ret i16 %0 +} + +define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0_fixed_length( +; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length( +; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } 
@llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length( +; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: 
[[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
+  %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
+  %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
+  %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
+  %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
+  %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
+  %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
+  %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
 declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
 declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
 declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
 declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)

From b3550a588a6b2087c27bdc89f0c849951cfb341f Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 10 Dec 2024 15:40:52 +0000
Subject: [PATCH 13/14] Fix formatting

---
 .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index b9fbce6955e7a..3111354addacd 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,15 +108,15 @@ static bool isNeg(Value *V);
 static Value *getNegOperand(Value *V);

 namespace {
-  template <typename T, typename IterT>
-  std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
-    auto Common = llvm::find_if(A, [B](T I){return llvm::is_contained(B, I);});
-    if (Common != A.end())
-      return std::make_optional(*Common);
-    return std::nullopt;
-  }
+template <typename T, typename IterT>
+std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
+  auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); });
+  if (Common != A.end())
+    return std::make_optional(*Common);
+  return std::nullopt;
+}

-  class ComplexDeinterleavingLegacyPass : public FunctionPass {
+class ComplexDeinterleavingLegacyPass : public FunctionPass {
 public:
   static char ID;

@@ -207,7 +207,7 @@ struct ComplexDeinterleavingCompositeNode {
     }
   }

-  bool AreOperandsValid() { return OperandsValid; }
+  bool areOperandsValid() { return OperandsValid; }
 };

 class ComplexDeinterleavingGraph {
@@ -1034,13 +1034,14 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
   if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
     return nullptr;

-  auto CommonUser = findCommonBetweenCollections<Value *>(R->users(), I->users());
+  auto CommonUser =
+      findCommonBetweenCollections<Value *>(R->users(), I->users());
   if (!CommonUser)
     return nullptr;

   auto *IInst = dyn_cast<IntrinsicInst>(*CommonUser);
   if (!IInst || IInst->getIntrinsicID() !=
-                Intrinsic::experimental_vector_partial_reduce_add)
+                    Intrinsic::experimental_vector_partial_reduce_add)
     return nullptr;

   if (NodePtr CN = identifyDotProduct(IInst))
@@ -1756,7 +1757,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {

 bool ComplexDeinterleavingGraph::checkNodes() {
   for (NodePtr N : CompositeNodes) {
-    if (!N->AreOperandsValid())
+    if (!N->areOperandsValid())
       return false;
   }

From 0329be676d34474594cc4e1002d1ad93be42a585 Mon Sep 17 00:00:00 2001
From: Nick Guy
Date: Tue, 17 Dec 2024 17:00:58 +0000
Subject: [PATCH 14/14] Update test name

---
 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
index b02e5972f54e5..11cf4c31936d8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -1019,8 +1019,8 @@ middle.block: ; preds = %vector.body
   ret i16 %0
 }

-define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0_fixed_length(
+define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
 ; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
 ; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -1046,7 +1046,7 @@ define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
 ; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
 ;
-; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length(
+; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
 ; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
 ; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -1072,7 +1072,7 @@ define i32 @cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
 ; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
 ;
-; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0_fixed_length(
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
 ; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
 ; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]