diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 1ccace59d6d36..e37bce3118bcb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -630,6 +630,10 @@ class TargetTransformInfo { AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const; + /// Query the target for the minimum vectorization factor for which + /// epilogue vectorization should be considered. + unsigned getEpilogueVectorizationMinVF() const; + /// Query the target whether it would be prefered to create a predicated /// vector loop, which can avoid the need to emit a scalar epilogue loop. bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const; @@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept { AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) = 0; + virtual unsigned getEpilogueVectorizationMinVF() = 0; virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0; virtual TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0; @@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { HardwareLoopInfo &HWLoopInfo) override { return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } + unsigned getEpilogueVectorizationMinVF() override { + return Impl.getEpilogueVectorizationMinVF(); + } bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override { return Impl.preferPredicateOverEpilogue(TFI); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index c3c5629d61c91..72038c090b792 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -199,6 +199,8 @@ class TargetTransformInfoImplBase { return false; } + unsigned getEpilogueVectorizationMinVF() const { return 16; } + bool 
preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } TailFoldingStyle diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index c2e48284c68ac..3b098c42f2741 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } + unsigned getEpilogueVectorizationMinVF() { + return BaseT::getEpilogueVectorizationMinVF(); + } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { return BaseT::preferPredicateOverEpilogue(TFI); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bc6a528c9dab3..174e5e87abe53 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable( return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } +unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const { + return TTIImpl->getEpilogueVectorizationMinVF(); +} + bool TargetTransformInfo::preferPredicateOverEpilogue( TailFoldingInfo *TFI) const { return TTIImpl->preferPredicateOverEpilogue(TFI); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 2a9a7533f8625..e37e2cacc7852 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { MaxBytesForLoopAlignment = 16; break; case NeoverseV2: - // Specialize cost for Neoverse-V2. 
+ case NeoverseV3: + EpilogueVectorizationMinVF = 8; + MaxInterleaveFactor = 4; ScatterOverhead = 13; LLVM_FALLTHROUGH; case NeoverseN2: case NeoverseN3: - case NeoverseV3: PrefFunctionAlignment = Align(16); PrefLoopAlignment = Align(32); MaxBytesForLoopAlignment = 16; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 91fef0e9a1ae9..d860c29e2291a 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool ATTRIBUTE = DEFAULT; #include "AArch64GenSubtargetInfo.inc" + unsigned EpilogueVectorizationMinVF = 16; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 2; uint16_t CacheLineSize = 0; @@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { hasFuseAdrpAdd() || hasFuseLiterals(); } + unsigned getEpilogueVectorizationMinVF() const { + return EpilogueVectorizationMinVF; + } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const; unsigned getCacheLineSize() const override { return CacheLineSize; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 84212b03686b1..ec7bb71fd111f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop, return false; } +unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const { + return ST->getEpilogueVectorizationMinVF(); +} + bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { if (!ST->hasSVE()) return false; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index a01d061c4c407..201bc831b816b 100644 --- 
a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase { return ST->useFixedOverScalableIfEqualCost(); } + unsigned getEpilogueVectorizationMinVF() const; + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI); bool supportsScalableVectors() const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9884dcb97ec5c..fda6550a37548 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -186,7 +186,7 @@ static cl::opt EpilogueVectorizationForceVF( "loops.")); static cl::opt EpilogueVectorizationMinVF( - "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, + "epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization.")); @@ -4701,8 +4701,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // See related "TODO: extend to support scalable VFs." in // selectEpilogueVectorizationFactor. unsigned Multiplier = VF.isFixed() ? IC : 1; - return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= - EpilogueVectorizationMinVF; + unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 + ? 
EpilogueVectorizationMinVF + : TTI.getEpilogueVectorizationMinVF(); + return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold; } VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll index 2b881fe19902e..8320608d67588 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll @@ -5,6 +5,8 @@ ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s ; Tests for selecting interleave counts for loops with loads and stores. 
@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2: exit: ; INTERLEAVE-2-NEXT: ret void ; +; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store( +; INTERLEAVE-4-VLA: call @llvm.smax.nxv16i8( +; INTERLEAVE-4-VLA-NEXT: call @llvm.smax.nxv16i8( +; INTERLEAVE-4-VLA-NEXT: call @llvm.smax.nxv16i8( +; INTERLEAVE-4-VLA-NEXT: call @llvm.smax.nxv16i8( +; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index bf64dccdb2667..fc2f8a0dcabf5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -5,6 +5,8 @@ ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s ; Tests for selecting the interleave count for loops with reductions. 
@@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; +; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction( +; INTERLEAVE-4-VLA: add +; INTERLEAVE-4-VLA-NEXT: add +; INTERLEAVE-4-VLA-NEXT: add +; INTERLEAVE-4-VLA-NEXT: add +; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll new file mode 100644 index 0000000000000..9e42c3c5dcab7 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll @@ -0,0 +1,118 @@ +; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-gnu" + +define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 { +; CHECK-LABEL: @V1( +; CHECK-NOT: vec.epilog.ph: +; CHECK-NOT: vec.epilog.vector.body: +; CHECK-NOT: vec.epilog.middle.block: +; CHECK-NOT: vec.epilog.scalar.ph: +; +entry: + %4 = icmp sgt i32 %2, 0 + br i1 %4, label %5, label %8 + +5: + %6 = zext nneg i32 %2 to i64 + br label %9 + +7: + br label %8 + +8: + ret i32 42 + +9: + %10 = phi i64 [ 0, %5 ], [ %16, %9 ] + %11 = getelementptr inbounds double, ptr %0, i64 %10 + %12 = load double, ptr %11, align 8 + %13 = getelementptr inbounds double, ptr %1, i64 %10 + %14 = load double, ptr %13, align 8 + %15 = fadd fast double %14, %12 + store double %15, ptr %11, align 8 + %16 = add nuw nsw i64 %10, 1 + %17 = icmp eq i64 %16, %6 + br i1 %17, label %7, label %9 +} + +define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 { +; +; CHECK-LABEL: @V2( +; CHECK: vec.epilog.ph: +; CHECK: vec.epilog.vector.body: +; 
CHECK: vec.epilog.middle.block: +; CHECK: vec.epilog.scalar.ph: +; +entry: + %4 = icmp sgt i32 %2, 0 + br i1 %4, label %5, label %8 + +5: + %6 = zext nneg i32 %2 to i64 + br label %9 + +7: + br label %8 + +8: + ret i32 42 + +9: + %10 = phi i64 [ 0, %5 ], [ %16, %9 ] + %11 = getelementptr inbounds double, ptr %0, i64 %10 + %12 = load double, ptr %11, align 8 + %13 = getelementptr inbounds double, ptr %1, i64 %10 + %14 = load double, ptr %13, align 8 + %15 = fadd fast double %14, %12 + store double %15, ptr %11, align 8 + %16 = add nuw nsw i64 %10, 1 + %17 = icmp eq i64 %16, %6 + br i1 %17, label %7, label %9 +} + +; TODO: The V3 will generate a scalable vector body, so it doesn't need an +; epilogue loop, but it needs to be checked whether that is really the best +; thing to do for the V3. +; +define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 { +; +; CHECK-LABEL: @V3( +; CHECK-NOT: vec.epilog.ph: +; CHECK-NOT: vec.epilog.vector.body: +; CHECK-NOT: vec.epilog.middle.block: +; CHECK-NOT: vec.epilog.scalar.ph: +; +entry: + %4 = icmp sgt i32 %2, 0 + br i1 %4, label %5, label %8 + +5: + %6 = zext nneg i32 %2 to i64 + br label %9 + +7: + br label %8 + +8: + ret i32 42 + +9: + %10 = phi i64 [ 0, %5 ], [ %16, %9 ] + %11 = getelementptr inbounds double, ptr %0, i64 %10 + %12 = load double, ptr %11, align 8 + %13 = getelementptr inbounds double, ptr %1, i64 %10 + %14 = load double, ptr %13, align 8 + %15 = fadd fast double %14, %12 + store double %15, ptr %11, align 8 + %16 = add nuw nsw i64 %10, 1 + %17 = icmp eq i64 %16, %6 + br i1 %17, label %7, label %9 +} + +attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" } + +attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" } + +attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" } diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll index 454a9789142f8..52d343e4105c7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ ; RUN: -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ -; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG +; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG-V2 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ ; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG @@ -12,6 +12,11 @@ define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i6 ; CHECK-EPILOG: vec.epilog.vector.body: ; CHECK-EPILOG: load +; The epilogue loop gets vectorised vscale x 2 x i16 wide. +; CHECK-EPILOG-V2: vec.epilog.ph: +; CHECK-EPILOG-V2: vec.epilog.vector.body: +; CHECK-EPILOG-V2: load + ; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph: ; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body: entry: