diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f4920a2..d2841d691cce2 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3155,8 +3155,11 @@ class TargetLoweringBase {
   ///
   /// \p DI is the deinterleave intrinsic.
   /// \p LI is the accompanying load instruction
-  virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
-                                                LoadInst *LI) const {
+  /// \p DeadInsts is a reference to a vector that keeps track of dead
+  /// instructions during transformations.
+  virtual bool lowerDeinterleaveIntrinsicToLoad(
+      IntrinsicInst *DI, LoadInst *LI,
+      SmallVectorImpl<Instruction *> &DeadInsts) const {
     return false;
   }
 
@@ -3166,8 +3169,11 @@ class TargetLoweringBase {
   ///
   /// \p II is the interleave intrinsic.
   /// \p SI is the accompanying store instruction
-  virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
-                                               StoreInst *SI) const {
+  /// \p DeadInsts is a reference to a vector that keeps track of dead
+  /// instructions during transformations.
+  virtual bool lowerInterleaveIntrinsicToStore(
+      IntrinsicInst *II, StoreInst *SI,
+      SmallVectorImpl<Instruction *> &DeadInsts) const {
     return false;
   }
 
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index d9e27e087e705..65ea948f16a32 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2924,6 +2924,17 @@ inline VScaleVal_match m_VScale() {
   return VScaleVal_match();
 }
 
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_Interleave2(const Opnd0 &Op0, const Opnd1 &Op1) {
+  return m_Intrinsic<Intrinsic::vector_interleave2>(Op0, Op1);
+}
+
+template <typename Opnd>
+inline typename m_Intrinsic_Ty<Opnd>::Ty m_Deinterleave2(const Opnd &Op) {
+  return m_Intrinsic<Intrinsic::vector_deinterleave2>(Op);
+}
+
 template <typename LHS, typename RHS, Instruction::BinaryOps Opcode,
           bool Commutable = false>
 struct LogicalOp_match {
   LHS L;
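
The two matchers added to PatternMatch.h compose like any other m_* helper. A minimal usage sketch, assuming a `Root` value to match against (the names `Root` and `A`..`D` are illustrative, not part of the patch):

    // Recognize interleave2(interleave2(A, C), interleave2(B, D)), i.e. the
    // IR shape of a would-be vector.interleave4; this is the same pattern
    // getValuesToInterleave() below matches.
    using namespace llvm::PatternMatch;
    Value *A, *B, *C, *D;
    bool IsInterleave4 =
        match(Root, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
                                  m_Interleave2(m_Value(B), m_Value(D))));
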
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8c9065aec7faa..ef11713892a53 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -489,7 +489,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
 
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeadInsts))
     return false;
 
   // We now have a target-specific load, so delete the old one.
@@ -510,13 +510,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
 
   LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
 
+  SmallVector<Instruction *, 4> InterleaveDeadInsts;
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
     return false;
 
   // We now have a target-specific store, so delete the old one.
   DeadInsts.push_back(SI);
   DeadInsts.push_back(II);
+  DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
+                   InterleaveDeadInsts.end());
 
   return true;
 }
@@ -537,7 +540,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
       // with a factor of 2.
       if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
         Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-      if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+      else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
         Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf205b1706a6c..24d53522973a5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16906,17 +16906,148 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   return true;
 }
 
+bool getDeinterleave2Values(
+    Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+    SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+  if (!DI->hasNUses(2))
+    return false;
+  auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+  auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+  if (!Extr1 || !Extr2)
+    return false;
+
+  DeinterleavedValues.resize(2);
+  // Place the values into the vector in the order of extraction:
+  DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
+  DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
+  if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
+    return false;
+
+  // Make sure that the extracted values match the deinterleave tree pattern.
+  if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) ||
+      !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) {
+    LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
+    return false;
+  }
+  // DeinterleavedValues will be replaced by the output of ld2.
+  DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
+                               DeinterleavedValues.begin(),
+                               DeinterleavedValues.end());
+  return true;
+}
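
For reference, getDeinterleave2Values accepts the plain factor-2 shape: a single deinterleave2 whose only two users are the extractvalues of fields 0 and 1. In IR (a sketch; the element types are arbitrary and mirror the tests added below):

    %di   = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
    %even = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di, 0
    %odd  = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di, 1
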
+
+/*
+DeinterleaveIntrinsic tree:
+                          [DI]
+                        /      \
+              [Extr<0>]          [Extr<1>]
+                  |                  |
+                [DI]               [DI]
+               /    \             /    \
+        [Extr<0>] [Extr<1>] [Extr<0>] [Extr<1>]
+            |         |         |         |
+roots:      A         C         B         D
+roots in correct order of DI4 will be: A B C D.
+Returns true if `DI` is the top of an IR tree that represents a theoretical
+vector.deinterleave4 intrinsic. When true is returned, \p `DeinterleavedValues`
+vector is populated with the results such an intrinsic would return: (i.e.
+{A, B, C, D} = vector.deinterleave4(...))
+*/
+bool getDeinterleave4Values(
+    Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+    SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+  if (!DI->hasNUses(2))
+    return false;
+  auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+  auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+  if (!Extr1 || !Extr2)
+    return false;
+
+  if (!Extr1->hasOneUse() || !Extr2->hasOneUse())
+    return false;
+  auto *DI1 = *(Extr1->user_begin());
+  auto *DI2 = *(Extr2->user_begin());
+
+  if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+    return false;
+  // Leaf nodes of the deinterleave tree:
+  auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
+  auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+  auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+  auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
+  // Make sure that A, B, C and D are ExtractValue instructions before getting
+  // the extract index.
+  if (!A || !B || !C || !D)
+    return false;
+
+  DeinterleavedValues.resize(4);
+  // Place the values into the vector in the order of deinterleave4:
+  DeinterleavedValues[0x3 &
+                      ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
+  DeinterleavedValues[0x3 &
+                      ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
+  DeinterleavedValues[0x3 &
+                      ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
+  DeinterleavedValues[0x3 &
+                      ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
+  if (!DeinterleavedValues[0] || !DeinterleavedValues[1] ||
+      !DeinterleavedValues[2] || !DeinterleavedValues[3])
+    return false;
+
+  // Make sure that A, B, C, D match the deinterleave tree pattern.
+  if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
+                                         m_ExtractValue<0>(m_Specific(DI))))) ||
+      !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2(
+                                         m_ExtractValue<1>(m_Specific(DI))))) ||
+      !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2(
+                                         m_ExtractValue<0>(m_Specific(DI))))) ||
+      !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
+                                         m_ExtractValue<1>(m_Specific(DI)))))) {
+    LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
+    return false;
+  }
+
+  // These values will not be used anymore,
+  // DI4 will be created instead of nested DI1 and DI2.
+  DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
+                               DeinterleavedValues.begin(),
+                               DeinterleavedValues.end());
+  DeInterleaveDeadInsts.push_back(cast<Instruction>(DI1));
+  DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr1));
+  DeInterleaveDeadInsts.push_back(cast<Instruction>(DI2));
+  DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr2));
+
+  return true;
+}
+
+bool getDeinterleavedValues(
+    Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+    SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+  if (getDeinterleave4Values(DI, DeinterleavedValues, DeInterleaveDeadInsts))
+    return true;
+  return getDeinterleave2Values(DI, DeinterleavedValues,
+                                DeInterleaveDeadInsts);
+}
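
The factor-4 shape this walks, written out as IR (a sketch mirroring the sve-deinterleave4.ll test added below; %load would come from the accompanying load instruction):

    %di  = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
    %e0  = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %di, 0
    %e1  = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %di, 1
    %di1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %e0)
    %di2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %e1)
    %A = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di1, 0   ; lane 0
    %C = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di1, 1   ; lane 2
    %B = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di2, 0   ; lane 1
    %D = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di2, 1   ; lane 3

getDeinterleave4Values orders the results as {A, B, C, D} and queues the inner deinterleaves and extracts as dead once a single ld4 replaces the tree.
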
+
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    IntrinsicInst *DI, LoadInst *LI) const {
+    IntrinsicInst *DI, LoadInst *LI,
+    SmallVectorImpl<Instruction *> &DeadInsts) const {
   // Only deinterleave2 supported at present.
   if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
     return false;
 
-  // Only a factor of 2 supported at present.
-  const unsigned Factor = 2;
+  SmallVector<Instruction *, 4> DeinterleavedValues;
+  SmallVector<Instruction *, 8> DeInterleaveDeadInsts;
 
-  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
-  const DataLayout &DL = DI->getDataLayout();
+  if (!getDeinterleavedValues(DI, DeinterleavedValues, DeInterleaveDeadInsts)) {
+    LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+    return false;
+  }
+  unsigned Factor = DeinterleavedValues.size();
+  assert((Factor == 2 || Factor == 4) &&
+         "Currently supported Factor is 2 or 4 only");
+  VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+
+  const DataLayout &DL = DI->getModule()->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
@@ -16927,7 +17058,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
-
   VectorType *LdTy =
       VectorType::get(VTy->getElementType(),
                       VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -16937,18 +17067,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
                                                 UseScalable, LdTy, PtrTy);
 
   IRBuilder<> Builder(LI);
-
   Value *Pred = nullptr;
   if (UseScalable)
     Pred =
         Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
 
   Value *BaseAddr = LI->getPointerOperand();
-
-  Value *Result;
   if (NumLoads > 1) {
-    Value *Left = PoisonValue::get(VTy);
-    Value *Right = PoisonValue::get(VTy);
-
+    // Create multiple legal small ldN.
+    SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
     for (unsigned I = 0; I < NumLoads; ++I) {
       Value *Offset = Builder.getInt64(I * Factor);
 
@@ -16958,40 +17085,96 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
         LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
       else
         LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
-
       Value *Idx =
           Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
-      Left = Builder.CreateInsertVector(
-          VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
-      Right = Builder.CreateInsertVector(
-          VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+      for (unsigned J = 0; J < Factor; ++J) {
+        ExtractedLdValues[J] = Builder.CreateInsertVector(
+            VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+      }
+      LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
     }
-
-    Result = PoisonValue::get(DI->getType());
-    Result = Builder.CreateInsertValue(Result, Left, 0);
-    Result = Builder.CreateInsertValue(Result, Right, 1);
+    // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4.
+    for (unsigned J = 0; J < Factor; ++J)
+      DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
   } else {
+    Value *Result;
     if (UseScalable)
       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
     else
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+    // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4.
+    for (unsigned I = 0; I < Factor; I++) {
+      Value *NewExtract = Builder.CreateExtractValue(Result, I);
+      DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+    }
   }
-
-  DI->replaceAllUsesWith(Result);
+  DeadInsts.insert(DeadInsts.end(), DeInterleaveDeadInsts.begin(),
+                   DeInterleaveDeadInsts.end());
   return true;
 }
+
+/*
+InterleaveIntrinsic tree:
+        A    C         B    D
+         \  /           \  /
+         [II]           [II]
+             \         /
+              \       /
+               [II]
+
+values in correct order of interleave4: A B C D.
+Returns true if `II` is the root of an IR tree that represents a theoretical
+vector.interleave4 intrinsic. When true is returned, \p `InterleavedValues`
+vector is populated with the inputs such an intrinsic would take: (i.e.
+vector.interleave4(A, B, C, D)).
+*/
+bool getValuesToInterleave(
+    Value *II, SmallVectorImpl<Value *> &InterleavedValues,
+    SmallVectorImpl<Instruction *> &InterleaveDeadInsts) {
+  Value *A, *B, *C, *D;
+  // Try to match interleave of Factor 4.
+  if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
+                              m_Interleave2(m_Value(B), m_Value(D))))) {
+    InterleavedValues.push_back(A);
+    InterleavedValues.push_back(B);
+    InterleavedValues.push_back(C);
+    InterleavedValues.push_back(D);
+    // Intermediate II will not be needed anymore.
+    InterleaveDeadInsts.push_back(
+        cast<Instruction>(cast<Instruction>(II)->getOperand(0)));
+    InterleaveDeadInsts.push_back(
+        cast<Instruction>(cast<Instruction>(II)->getOperand(1)));
+    return true;
+  }
+
+  // Try to match interleave of Factor 2.
+  if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
+    InterleavedValues.push_back(A);
+    InterleavedValues.push_back(B);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
-    IntrinsicInst *II, StoreInst *SI) const {
+    IntrinsicInst *II, StoreInst *SI,
+    SmallVectorImpl<Instruction *> &DeadInsts) const {
   // Only interleave2 supported at present.
   if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
     return false;
 
-  // Only a factor of 2 supported at present.
-  const unsigned Factor = 2;
+  SmallVector<Value *, 4> InterleavedValues;
+  SmallVector<Instruction *, 2> InterleaveDeadInsts;
+  if (!getValuesToInterleave(II, InterleavedValues, InterleaveDeadInsts)) {
+    LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+    return false;
+  }
+  unsigned Factor = InterleavedValues.size();
+  assert((Factor == 2 || Factor == 4) &&
+         "Currently supported Factor is 2 or 4 only");
+  VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
+  const DataLayout &DL = II->getModule()->getDataLayout();
 
-  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
-  const DataLayout &DL = II->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
@@ -17020,27 +17203,28 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
     Pred =
         Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
 
-  Value *L = II->getOperand(0);
-  Value *R = II->getOperand(1);
-
+  auto ExtractedValues = InterleavedValues;
+  if (UseScalable)
+    InterleavedValues.push_back(Pred);
+  InterleavedValues.push_back(BaseAddr);
   for (unsigned I = 0; I < NumStores; ++I) {
     Value *Address = BaseAddr;
     if (NumStores > 1) {
       Value *Offset = Builder.getInt64(I * Factor);
       Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
-
       Value *Idx =
           Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
-      L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
-      R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+      for (unsigned J = 0; J < Factor; J++) {
+        InterleavedValues[J] =
+            Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
+      }
+      // Update the address.
+      InterleavedValues[InterleavedValues.size() - 1] = Address;
     }
-
-    if (UseScalable)
-      Builder.CreateCall(StNFunc, {L, R, Pred, Address});
-    else
-      Builder.CreateCall(StNFunc, {L, R, Address});
+    Builder.CreateCall(StNFunc, InterleavedValues);
   }
-
+  DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
+                   InterleaveDeadInsts.end());
   return true;
 }
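
Taken together, the store-side rewrite corresponds to the following transformation (a sketch mirroring the sve-interleave4.ll test added below; %pred stands for the all-true predicate splat the lowering emits):

    ; Before: two inner interleave2 results feeding an outer interleave2.
    %half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
    %half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
    %vec   = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %half1, <vscale x 8 x i32> %half2)
    store <vscale x 16 x i32> %vec, ptr %dst, align 4
    ; After: one st4 with the lanes restored to A, B, C, D order.
    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i1> %pred, ptr %dst)
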
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fcdd47541be82..b2c511b4917bc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -703,11 +703,13 @@ class AArch64TargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
-  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
-                                        LoadInst *LI) const override;
+  bool lowerDeinterleaveIntrinsicToLoad(
+      IntrinsicInst *DI, LoadInst *LI,
+      SmallVectorImpl<Instruction *> &DeadInsts) const override;
 
-  bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
-                                       StoreInst *SI) const override;
+  bool lowerInterleaveIntrinsicToStore(
+      IntrinsicInst *II, StoreInst *SI,
+      SmallVectorImpl<Instruction *> &DeadInsts) const override;
 
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e938454b8e642..b55018bd9b2ef 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21775,8 +21775,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
   return true;
 }
 
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
-                                                           LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+    IntrinsicInst *DI, LoadInst *LI,
+    SmallVectorImpl<Instruction *> &DeadInsts) const {
   assert(LI->isSimple());
   IRBuilder<> Builder(LI);
 
@@ -21825,8 +21826,9 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
   return true;
 }
 
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
-                                                          StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+    IntrinsicInst *II, StoreInst *SI,
+    SmallVectorImpl<Instruction *> &DeadInsts) const {
   assert(SI->isSimple());
   IRBuilder<> Builder(SI);
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b0ad9229f0b3..57c4d308e395d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -876,11 +876,13 @@ class RISCVTargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
-  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
-                                        LoadInst *LI) const override;
+  bool lowerDeinterleaveIntrinsicToLoad(
+      IntrinsicInst *II, LoadInst *LI,
+      SmallVectorImpl<Instruction *> &DeadInsts) const override;
 
-  bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
-                                       StoreInst *SI) const override;
+  bool lowerInterleaveIntrinsicToStore(
+      IntrinsicInst *II, StoreInst *SI,
+      SmallVectorImpl<Instruction *> &DeadInsts) const override;
 
   bool supportKCFIBundles() const override { return true; }
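
The RISC-V overrides above only adopt the widened signature; their bodies are unchanged and report nothing extra as dead. For a target implementing the new contract, the hook is expected to push whatever the match made dead beyond DI/II themselves, roughly (a hypothetical sketch; MyTargetLowering and MatchedTreeInsts are illustrative names, not part of this patch):

    bool MyTargetLowering::lowerDeinterleaveIntrinsicToLoad(
        IntrinsicInst *DI, LoadInst *LI,
        SmallVectorImpl<Instruction *> &DeadInsts) const {
      // ... match the (de)interleave tree, emit the target load, and
      // replace all uses of the matched leaves ...
      // Hand the matched tree back to the pass, which erases it together
      // with DI and LI.
      DeadInsts.append(MatchedTreeInsts.begin(), MatchedTreeInsts.end());
      return true;
    }
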
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 24d624c221f46..09e2c53465cd7 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -6,123 +6,165 @@
 target triple = "aarch64-linux-gnu"
 
-define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
-; NEON-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+define void @deinterleave_i8_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i8_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <16 x i8>, <16 x i8> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <32 x i8>, ptr %ptr, align 1
   %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
-  ret { <16 x i8>, <16 x i8> } %deinterleave
+  %extract1 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 0
+  %extract2 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 1
+  ret void
 }
 
-define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
-; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+define void @deinterleave_i16_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i16_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <8 x i16>, <8 x i16> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i16_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <16 x i16>, ptr %ptr, align 2
   %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %load)
-  ret { <8 x i16>, <8 x i16> } %deinterleave
+  %extract1 = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 0
+  %extract2 = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 1
+  ret void
 }
 
-define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
-; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+define void @deinterleave_8xi32_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_8xi32_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <4 x i32>, <4 x i32> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_8xi32_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <8 x i32>, ptr %ptr, align 4
   %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %load)
-  ret { <4 x i32>, <4 x i32> } %deinterleave
+  %extract1 = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 0
+  %extract2 = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 1
+  ret void
 }
 
-define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+define void @deinterleave_i64_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i64_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <2 x i64>, <2 x i64> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i64_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <4 x i64>, ptr %ptr, align 8
   %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> %load)
-  ret { <2 x i64>, <2 x i64> } %deinterleave
+  %extract1 = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 0
+  %extract2 = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 1
+  ret void
 }
 
-define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
-; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+define void @deinterleave_float_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_float_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <4 x float>, <4 x float> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_float_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <8 x float>, ptr %ptr, align 4
   %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %load)
-  ret { <4 x float>, <4 x float> } %deinterleave
+  %extract1 = extractvalue { <4 x float>, <4 x float> } %deinterleave, 0
+  %extract2 = extractvalue { <4 x float>, <4 x float> } %deinterleave, 1
+  ret void
 }
 
-define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+define void @deinterleave_double_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_double_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <2 x double>, <2 x double> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_double_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <4 x double>, ptr %ptr, align 8
   %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %load)
-  ret { <2 x double>, <2 x double> } %deinterleave
+  %extract1 = extractvalue { <2 x double>, <2 x double> } %deinterleave, 0
+  %extract2 = extractvalue { <2 x double>, <2 x double> } %deinterleave, 1
+  ret void
 }
 
-define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+define void @deinterleave_ptr_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_ptr_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
-; NEON-NEXT:    ret { <2 x ptr>, <2 x ptr> } [[LDN]]
+; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
+; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_ptr_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <4 x ptr>, ptr %ptr, align 8
   %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> %load)
-  ret { <2 x ptr>, <2 x ptr> } %deinterleave
+  %extract1 = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 0
+  %extract2 = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 1
+  ret void
 }
 
 define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
@@ -244,8 +286,8 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
   ret void
 }
 
-define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
-; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
+; NEON-LABEL: define void @deinterleave_wide_i16_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0
 ; NEON-NEXT:    [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]])
@@ -259,19 +301,21 @@ define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
 ; NEON-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
 ; NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
 ; NEON-NEXT:    [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
-; NEON-NEXT:    [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
-; NEON-NEXT:    [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
-; NEON-NEXT:    ret { <16 x i16>, <16 x i16> } [[TMP12]]
+; NEON-NEXT:    ret void
 ;
-; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_wide_i16_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
 ; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
-; SVE-FIXED-NEXT:    ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT:    ret void
 ;
   %load = load <32 x i16>, ptr %ptr, align 2
   %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> %load)
-  ret { <16 x i16>, <16 x i16> } %deinterleave
+  %extract1 = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 0
+  %extract2 = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 1
+  ret void
 }
 
 define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 2a05718cc4161..e5b56eb54f927 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -4,81 +4,109 @@
 target triple = "aarch64-linux-gnu"
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2
+define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i8>, ptr %ptr, align 1
   %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
+  %extract1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 1
+  %extract2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 0
+  ret void
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2
+define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi16_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 16 x i16>, ptr %ptr, align 2
   %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
+  %extract1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 1
+  ret void
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2
+define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nx8xi32_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x i32>, ptr %ptr, align 4
   %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
+  %extract1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 1
+  ret void
 }
 
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2
+define void @deinterleave_nxi64_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi64_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x i64>, ptr %ptr, align 8
   %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
-  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
+  %extract1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 1
+  ret void
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2
+define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxfloat_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x float>, ptr %ptr, align 4
   %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
+  %extract1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 1
+  ret void
 }
 
-define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2
+define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxdouble_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x double>, ptr %ptr, align 8
   %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
-  ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
+  %extract1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 1
+  ret void
 }
 
-define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2
+define void @deinterleave_nxptr_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxptr_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT:    ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x ptr>, ptr %ptr, align 8
   %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
-  ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
+  %extract1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave, 1
+  ret void
 }
 
 define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x 16 x i8> %r) #0 {
@@ -160,8 +188,8 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r) #0 {
   ret void
 }
 
-define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2
+define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxi32_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 0
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
@@ -187,17 +215,17 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
-; CHECK-NEXT:    [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
-; CHECK-NEXT:    [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
-; CHECK-NEXT:    ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i32>, ptr %ptr, align 4
   %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
-  ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
+  %extract1 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave, 1
+  ret void
 }
 
-define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2
+define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxdouble_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
@@ -211,13 +239,13 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
-; CHECK-NEXT:    ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x double>, ptr %ptr, align 8
   %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
-  ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
+  %extract1 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 0
+  %extract2 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 1
+  ret void
 }
 
 define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l, <vscale x 4 x double> %r) #0 {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
new file mode 100644
index 0000000000000..06ecff6729881
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT:    [[SUM:%.*]] = add <vscale x 4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret void
+;
+
+  %load = load <vscale x 16 x i32>, ptr %src, align 4
+  %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+  %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+  %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+  %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+  %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+  %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+  %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+  %sum = add <vscale x 4 x i32> %5, %7
+  %sub = sub <vscale x 4 x i32> %6, %8
+  ret void
+}
+
+define void @wide_deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @wide_deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP11]], i64 4)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP13]], i64 4)
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
+; CHECK-NEXT:    [[SUM:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 8 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 32 x i32>, ptr %src, align 4
+  %deinterleave_src = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+  %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 0
+  %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 1
+  %deinterleave_half1 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+  %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 0
+  %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 1
+  %deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+  %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0
+  %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1
+  %sum = add <vscale x 8 x i32> %5, %7
+  %sub = sub <vscale x 8 x i32> %6, %8
+  ret void
+}
+
+define void @mix_deinterleave4_deinterleave2(ptr %src) {
+; CHECK-LABEL: define void @mix_deinterleave4_deinterleave2
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT:    ret void
+;
+
+  %load = load <vscale x 16 x i32>, ptr %src, align 4
+  %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+  %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+  %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+  %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+  %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+  %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+  %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+  %load2 = load <vscale x 8 x i32>, ptr %src, align 4
+  %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load2)
+  %ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
+  %ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
+  ret void
+}
+
+define void @negative_deinterleave4_test(ptr %src) {
+; CHECK-LABEL: define void @negative_deinterleave4_test
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT:    [[DEINTERLEAVE_HALF1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF1]], 0
+; CHECK-NEXT:    [[DEINTERLEAVE_HALF2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF2]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 16 x i32>, ptr %src, align 4
+  %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+  %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+  %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+  %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+  %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+  ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
new file mode 100644
index 0000000000000..ba9bff093678c
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+  %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+  %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+  %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+  store <vscale x 16 x i32> %interleaved.vec, ptr %dst, align 4
+  ret void
+}
+
+define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
+; CHECK-LABEL: define void @wide_interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 4
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 4)
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 4)
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 4)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT:    ret void
+;
+  %interleaved.half1 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %c)
+  %interleaved.half2 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %d)
+  %interleaved.vec = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.half1, <vscale x 16 x i32> %interleaved.half2)
+  store <vscale x 32 x i32> %interleaved.vec, ptr %dst, align 4
+  ret void
+}
+
+define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @mix_interleave4_interleave2
+; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
+; CHECK-NEXT:    ret void
+;
+  %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+  %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+  %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+  store <vscale x 16 x i32> %interleaved.vec, ptr %dst1, align 4
+
+  %interleaved = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+  store <vscale x 8 x i32> %interleaved, ptr %dst2, align 4
+  ret void
+}
%dst2, align 4 + ret void +} diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll index 73f26814f3a4b..8821255a86b2f 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll @@ -1,36 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -interleaved-access -S | FileCheck %s ; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s target triple = "aarch64-linux-gnu" define void @load_factor2(ptr %ptr) #0 { -; CHECK-LABEL: @load_factor2( -; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) -; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8i16( [[PTRUE]], ptr %ptr) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 -; CHECK-NEXT: [[EXT1:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16( [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[EXT2:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16( [[TMP3]], i64 0) -; CHECK-NEXT: ret void +; CHECK-LABEL: define void @load_factor2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8i16( [[TMP1]], ptr [[PTR]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16( [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16( [[TMP4]], i64 0) +; CHECK-NEXT: ret void +; %interleaved.vec = load <32 x i16>, ptr %ptr, align 4 %v0 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> + i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> %v1 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> + i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> ret void } define void @load_factor3(ptr %ptr) #0 { -; CHECK-LABEL: @load_factor3( -; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) -; CHECK-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv4i32( [[PTRUE]], ptr %ptr) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[LDN]], 2 -; CHECK-NEXT: [[EXT1:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32( [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[LDN]], 1 -; CHECK-NEXT: [[EXT2:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32( [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[LDN]], 0 -; CHECK-NEXT: [[EXT3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32( [[TMP4]], i64 0) -; CHECK-NEXT: ret void +; CHECK-LABEL: define void @load_factor3( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv4i32( [[TMP1]], ptr [[PTR]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32( [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32( [[TMP4]], i64 0) 
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <24 x i32>, ptr %ptr, align 4
   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
   %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -39,18 +44,20 @@ define void @load_factor3(ptr %ptr) #0 {
 }
 
 define void @load_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor4(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT:    [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT:    [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT:    [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @load_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <16 x i64>, ptr %ptr, align 4
   %v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -60,54 +67,64 @@ define void @load_factor4(ptr %ptr) #0 {
 }
 
 define void @store_factor2(ptr %ptr, <16 x i16> %v0, <16 x i16> %v1) #0 {
-; CHECK-LABEL: @store_factor2(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP1]], i64 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[INS1]], <vscale x 8 x i16> [[INS2]], <vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x i16> [[V0:%.*]], <16 x i16> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = shufflevector <16 x i16> %v0, <16 x i16> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
                                                                                i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   store <32 x i16> %interleaved.vec, ptr %ptr, align 4
   ret void
 }
 
 define void @store_factor3(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) #0 {
-; CHECK-LABEL: @store_factor3(
-; CHECK:    [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP1]], i64 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[INS3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[INS1]], <vscale x 4 x i32> [[INS2]], <vscale x 4 x i32> [[INS3]], <vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                    i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19,
                                                                                i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
   store <24 x i32> %interleaved.vec, ptr %ptr, align 4
   ret void
 }
 
 define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) #0 {
-; CHECK-LABEL: @store_factor4(
-; CHECK:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]], <4 x i64> [[V2:%.*]], <4 x i64> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i64> [[V2]], <4 x i64> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -116,16 +133,18 @@ define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2
 }
 
 define void @load_ptrvec_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor2(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT:    [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @load_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <8 x ptr>, ptr %ptr, align 4
   %v0 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %v1 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -133,19 +152,21 @@ define void @load_ptrvec_factor2(ptr %ptr) #0 {
 }
 
 define void @load_ptrvec_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor3(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT:    [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT:    [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT:    [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @load_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <12 x ptr>, ptr %ptr, align 4
   %v0 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
   %v1 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
@@ -154,22 +175,24 @@ define void @load_ptrvec_factor3(ptr %ptr) #0 {
 }
 
 define void @load_ptrvec_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor4(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT:    [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT:    [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT:    [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT:    [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT:    [[TOP4:%.*]] = inttoptr <4 x i64> [[EXT4]] to <4 x ptr>
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @load_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP11]], i64 0)
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr <4 x i64> [[TMP12]] to <4 x ptr>
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <16 x ptr>, ptr %ptr, align 4
   %v0 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %v1 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -179,34 +202,40 @@ define void @load_ptrvec_factor4(ptr %ptr) #0 {
 }
 
 define void @store_ptrvec_factor2(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1) #0 {
-; CHECK-LABEL: @store_ptrvec_factor2(
-; CHECK-NEXT:    [[TOI1:%.*]] = ptrtoint <4 x ptr> %v0 to <4 x i64>
-; CHECK-NEXT:    [[TOI2:%.*]] = ptrtoint <4 x ptr> %v1 to <4 x i64>
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x ptr> [[V0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x ptr> [[V1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x ptr> %interleaved.vec, ptr %ptr, align 4
   ret void
 }
 
 define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2) #0 {
-; CHECK-LABEL: @store_ptrvec_factor3(
-; CHECK:    [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT:    [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x ptr> %v2, <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -215,45 +244,51 @@ define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x p
 }
 
 define void @store_ptrvec_factor4(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2, <4 x ptr> %v3) #0 {
-; CHECK-LABEL: @store_ptrvec_factor4(
-; CHECK:    [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT:    [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]], <4 x ptr> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP10]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x ptr> %v2, <4 x ptr> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13,
                                                                             i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
   store <16 x ptr> %interleaved.vec, ptr %ptr, align 4
   ret void
 }
 
 define void @load_factor2_wide(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2_wide(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT:    [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr [[TMP4]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT:    [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[EXT1]], <4 x i64> [[EXT3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[EXT2]], <4 x i64> [[EXT4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @load_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP9]], i64 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <16 x i64>, ptr %ptr, align 4
   %v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -261,20 +296,22 @@ define void @load_factor2_wide(ptr %ptr) #0 {
 }
 
 define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
-; CHECK-LABEL: @store_factor2_wide(
-; CHECK-NEXT:    [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr [[TMP6]])
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: define void @store_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i64> [[V0:%.*]], <8 x i64> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[TMP10]])
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = shufflevector <8 x i64> %v0, <8 x i64> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i64> %interleaved.vec, ptr %ptr, align 4
   ret void
@@ -282,9 +319,27 @@ define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
 
 ; Check that neon is used for illegal multiples of 128-bit types
 define void @load_384bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_384bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_384bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[PTR]], i32 4
+; CHECK-NEXT:    [[LDN1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP3]], i32 4
+; CHECK-NEXT:    [[LDN2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <12 x i64>, ptr %ptr, align 4
   %v0 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
   %v1 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
@@ -293,9 +348,13 @@ define void @load_384bit(ptr %ptr) #0 {
 
 ; Check that neon is used for 128-bit vectors
 define void @load_128bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_128bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_128bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <4 x i64>, ptr %ptr, align 4
   %v0 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
   %v1 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
@@ -304,8 +363,16 @@ define void @load_128bit(ptr %ptr) #0 {
 
 ; Check that correct ptrues are generated for min != max case
 define void @load_min_not_max(ptr %ptr) #1 {
-; CHECK-LABEL: @load_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <8 x i64>, ptr %ptr, align 4
   %v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -313,8 +380,16 @@ define void @load_min_not_max(ptr %ptr) #1 {
 }
 
 define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
-; CHECK-LABEL: @store_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x i64> %interleaved.vec, ptr %ptr, align 4
   ret void
@@ -322,8 +397,16 @@ define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
 
 ; Check that correct ptrues are generated for min > type case
 define void @load_min_ge_type(ptr %ptr) #2 {
-; CHECK-LABEL: @load_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = load <8 x i64>, ptr %ptr, align 4
   %v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -331,25 +414,34 @@ define void @load_min_ge_type(ptr %ptr) #2 {
 }
 
 define void @store_min_ge_type(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #2 {
-; CHECK-LABEL: @store_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
   %interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x i64> %interleaved.vec, ptr %ptr, align 4
   ret void
 }
 
 define void @load_double_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_double_factor4(
+; CHECK-LABEL: define void @load_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP5]], i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP7]], i64 0)
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP9]], i64 0)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP8]], i64 0)
 ; CHECK-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x double>, ptr %ptr, align 4
@@ -361,15 +453,16 @@ define void @load_double_factor4(ptr %ptr) #0 {
 }
 
 define void @load_float_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_float_factor3(
+; CHECK-LABEL: define void @load_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP7]], i64 0)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP6]], i64 0)
 ; CHECK-NEXT:    ret void
 ;
   %interleaved.vec = load <24 x float>, ptr %ptr, align 4
@@ -380,13 +473,14 @@ define void @load_float_factor3(ptr %ptr) #0 {
 }
 
 define void @load_half_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_half_factor2(
+; CHECK-LABEL: define void @load_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP4]], i64 0)
 ; CHECK-NEXT:    ret void
 ;
   %interleaved.vec = load <32 x half>, ptr %ptr, align 4
@@ -396,13 +490,14 @@ define void @load_half_factor2(ptr %ptr) #0 {
 }
 
 define void @load_bfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_bfloat_factor2(
+; CHECK-LABEL: define void @load_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP4]], i64 0)
 ; CHECK-NEXT:    ret void
 ;
   %interleaved.vec = load <32 x bfloat>, ptr %ptr, align 4
@@ -412,9 +507,10 @@ define void @load_bfloat_factor2(ptr %ptr) #0 {
 }
 
 define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) #0 {
-; CHECK-LABEL: @store_double_factor4(
-; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define void @store_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x double> [[V0]], <4 x double> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x double> [[V2]], <4 x double> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP2]], i64 0)
@@ -424,7 +520,7 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP6]], i64 0)
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP8]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -435,9 +531,10 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
 }
 
 define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8 x float> %v2) #0 {
-; CHECK-LABEL: @store_float_factor3(
-; CHECK-NEXT:    [[S0:%.*]] = shufflevector <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x float> [[V2:%.*]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-LABEL: define void @store_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <8 x float> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <8 x float> [[V0]], <8 x float> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x float> [[V2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP2]], i64 0)
@@ -445,7 +542,7 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP6]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
   %s0 = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -456,13 +553,14 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
 }
 
 define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
-; CHECK-LABEL: @store_half_factor2(
+; CHECK-LABEL: define void @store_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP2]], i64 0)
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP4]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <16 x half> %v0, <16 x half> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -471,13 +569,14 @@ define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
 }
 
 define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1) #0 {
-; CHECK-LABEL: @store_bfloat_factor2(
+; CHECK-LABEL: define void @store_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP2]], i64 0)
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP4]], i64 0)
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <16 x bfloat> %v0, <16 x bfloat> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -486,8 +585,9 @@ define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1
 }
 
 ; Ensure vscale_range property does not affect scalable vector types.
-define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(ptr %ptr) #2 {
-; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(
+define void @deinterleave_nxptr_factor2(ptr %ptr) #2 {
+; CHECK-LABEL: define void @deinterleave_nxptr_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
 ; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
@@ -506,13 +606,13 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_fact
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
-; CHECK-NEXT:    ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT:    ret void
 ;
   %wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
   %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
-  ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
+  %extract1 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %ldN, 0
+  %extract2 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %ldN, 1
+  ret void
 }
 
 attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
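For readers skimming the updated @deinterleave_nxptr_factor2 coverage: the shape the pass consumes for scalable vectors is a wide load feeding a single @llvm.vector.deinterleave2 call whose two halves are read back with extractvalue, and the checks above confirm it is rewritten into @llvm.aarch64.sve.ld2.sret calls. A minimal standalone sketch of that input pattern, not part of the patch (the function name is made up, and it assumes an SVE target like the tests above):

; Sketch only -- mirrors the IR pattern exercised by @deinterleave_nxptr_factor2,
; at a smaller element type. Not taken from the patch.
define void @sketch_deinterleave2(ptr %ptr) "target-features"="+sve" {
  ; One contiguous load of the interleaved data.
  %wide.vec = load <vscale x 4 x i32>, ptr %ptr, align 4
  ; deinterleave2 splits even and odd lanes into the two struct fields;
  ; the interleaved-access pass matches this load + intrinsic + extractvalue
  ; shape and, per the checks above, emits an SVE ld2 in its place.
  %dei = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.vec)
  %even = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %dei, 0
  %odd = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %dei, 1
  ret void
}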