diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3c868dbbf8b3a..96ec2dcbb715b 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1324,6 +1324,12 @@ def FeaturePredictableSelectIsExpensive
     : SubtargetFeature<"predictable-select-expensive", "PredictableSelectIsExpensive",
                        "true", "Prefer likely predicted branches over selects">;
 
+def FeatureUseFixedOverScalableIfEqualCost
+    : SubtargetFeature<"use-fixed-over-scalable-if-equal-cost",
+                       "UseFixedOverScalableIfEqualCost", "true",
+                       "Prefer fixed width loop vectorization over scalable "
+                       "if the cost-model assigns equal costs">;
+
 def TuneOptimizedZeroStrideLoad
     : SubtargetFeature<"optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
                        "true", "Optimized (perform fewer memory operations)"
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 9c37a4f6ec2d0..fffae92e78b2f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -342,6 +342,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool preferFixedOverScalableIfEqualCost() const {
+    return ST->useFixedOverScalableIfEqualCost();
+  }
+
   enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
   unsigned getNumberOfRegisters(unsigned ClassID) const {
     switch (ClassID) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/prefer-fixed-if-equal-to-scalable.ll b/llvm/test/Transforms/LoopVectorize/RISCV/prefer-fixed-if-equal-to-scalable.ll
new file mode 100644
index 0000000000000..eebd34958905c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/prefer-fixed-if-equal-to-scalable.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple riscv64 -S -passes=loop-vectorize -force-target-instruction-cost=1 < %s \
+; RUN:   -mattr=+v | FileCheck %s -check-prefix=SCALABLE
+; RUN: opt -mtriple riscv64 -S -passes=loop-vectorize -force-target-instruction-cost=1 < %s \
+; RUN:   -mattr=+v,+use-fixed-over-scalable-if-equal-cost \
+; RUN:   | FileCheck %s -check-prefix=FIXED
+
+define void @s000(ptr %a, ptr %b, i32 %n) {
+; SCALABLE-LABEL: define void @s000(
+; SCALABLE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; SCALABLE-NEXT: [[ENTRY:.*:]]
+; SCALABLE-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; SCALABLE-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; SCALABLE-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; SCALABLE-NEXT: br i1 [[CMP6]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; SCALABLE: [[FOR_BODY_PREHEADER]]:
+; SCALABLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; SCALABLE: [[VECTOR_MEMCHECK]]:
+; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; SCALABLE-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; SCALABLE-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; SCALABLE: [[VECTOR_PH]]:
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP8]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SCALABLE: [[VECTOR_BODY]]:
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; SCALABLE-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0
+; SCALABLE-NEXT: store <vscale x 4 x float> [[TMP14]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE: [[MIDDLE_BLOCK]]:
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; SCALABLE: [[SCALAR_PH]]:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
+; SCALABLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; SCALABLE-NEXT: br label %[[FOR_COND_CLEANUP]]
+; SCALABLE: [[FOR_COND_CLEANUP]]:
+; SCALABLE-NEXT: ret void
+; SCALABLE: [[FOR_BODY]]:
+; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; SCALABLE-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; SCALABLE-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], 1.000000e+00
+; SCALABLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; SCALABLE-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; FIXED-LABEL: define void @s000(
+; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; FIXED-NEXT: [[ENTRY:.*:]]
+; FIXED-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; FIXED-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; FIXED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; FIXED-NEXT: br i1 [[CMP6]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; FIXED: [[FOR_BODY_PREHEADER]]:
+; FIXED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; FIXED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; FIXED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; FIXED: [[VECTOR_MEMCHECK]]:
+; FIXED-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
+; FIXED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
+; FIXED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; FIXED: [[VECTOR_PH]]:
+; FIXED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; FIXED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; FIXED-NEXT: br label %[[VECTOR_BODY:.*]]
+; FIXED: [[VECTOR_BODY]]:
+; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
+; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; FIXED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; FIXED-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; FIXED-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[WIDE_LOAD3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; FIXED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; FIXED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; FIXED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 8
+; FIXED-NEXT: store <8 x float> [[TMP7]], ptr [[TMP11]], align 4
+; FIXED-NEXT: store <8 x float> [[TMP8]], ptr [[TMP12]], align 4
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FIXED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED: [[MIDDLE_BLOCK]]:
+; FIXED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; FIXED-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; FIXED: [[SCALAR_PH]]:
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; FIXED-NEXT: br label %[[FOR_BODY:.*]]
+; FIXED: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; FIXED-NEXT: br label %[[FOR_COND_CLEANUP]]
+; FIXED: [[FOR_COND_CLEANUP]]:
+; FIXED-NEXT: ret void
+; FIXED: [[FOR_BODY]]:
+; FIXED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; FIXED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; FIXED-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], 1.000000e+00
+; FIXED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; FIXED-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; FIXED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  store float %add, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; SCALABLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; SCALABLE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; SCALABLE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; SCALABLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
+; FIXED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; FIXED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; FIXED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; FIXED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.