diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0941bf61953f1..2c7929d91121f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -110,11 +110,16 @@ using namespace std::placeholders;
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
 
+STATISTIC(NumFaddVectorized, "Number of vectorized fadd reductions");
 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
 
 DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
               "Controls which SLP graphs should be vectorized.");
 
+static cl::opt<bool> SLPEnableOrderedFPReductions(
+    "slp-ordered-fp-reds", cl::init(true), cl::Hidden,
+    cl::desc("Enable vectorization of ordered floating point reductions"));
+
 static cl::opt<bool>
     RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                         cl::desc("Run the SLP vectorization passes"));
@@ -1850,6 +1855,11 @@ class BoUpSLP {
     return VectorizableTree.front()->Scalars;
   }
 
+  bool areAllEntriesIdentityOrdered() const {
+    return all_of(VectorizableTree,
+                  [&](auto &Entry) { return Entry->ReorderIndices.empty(); });
+  }
+
   /// Returns the type/is-signed info for the root node in the graph without
   /// casting.
   std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
@@ -21774,6 +21784,8 @@ class HorizontalReduction {
   /// signedness.
   SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
 
+  SmallVector<Value *> InitialFAddValues;
+
   static bool isCmpSelMinMax(Instruction *I) {
     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
            RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -21787,6 +21799,14 @@ class HorizontalReduction {
            (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
   }
 
+  bool isOrderedFaddReduction() const {
+    if (!isa<Instruction>(ReductionRoot))
+      return false;
+    auto *I = cast<Instruction>(ReductionRoot);
+    return (RdxKind == RecurKind::FAdd) &&
+           !I->getFastMathFlags().allowReassoc();
+  }
+
   /// Checks if instruction is associative and can be vectorized.
   static bool isVectorizable(RecurKind Kind, Instruction *I) {
     if (Kind == RecurKind::None)
@@ -21807,6 +21827,9 @@ class HorizontalReduction {
     if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
       return true;
 
+    if (Kind == RecurKind::FAdd && SLPEnableOrderedFPReductions)
+      return true;
+
     return I->isAssociative();
   }
@@ -22066,6 +22089,37 @@ class HorizontalReduction {
            (I && !isa(I) && isValidForAlternation(I->getOpcode()));
   }
 
+  bool checkOperandsOrder() const {
+    auto OpsVec = reverse(ReductionOps[0]);
+    if (!isOrderedFaddReduction() || empty(OpsVec))
+      return false;
+    Value *PrevOperand = *OpsVec.begin();
+    for (auto *I : drop_begin(OpsVec)) {
+      Value *Op1 = cast<Instruction>(I)->getOperand(0);
+      if (Op1 != PrevOperand)
+        return false;
+      PrevOperand = I;
+    }
+    return true;
+  }
+
+  bool checkFastMathFlags() const {
+    for (auto OpsVec : ReductionOps) {
+      if (OpsVec.size() <= 1)
+        continue;
+      Value *V = *OpsVec.begin();
+      if (!isa<Instruction>(V))
+        continue;
+      bool Flag = cast<Instruction>(V)->getFastMathFlags().allowReassoc();
+      auto It = find_if(drop_begin(OpsVec), [&](Value *I) {
+        auto CurFlag = cast<Instruction>(I)->getFastMathFlags().allowReassoc();
+        return (Flag != CurFlag);
+      });
+      if (It != OpsVec.end())
+        return false;
+    }
+    return true;
+  }
+
 public:
   HorizontalReduction() = default;
@@ -22180,9 +22234,10 @@ class HorizontalReduction {
       // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
-        size_t Key, Idx;
-        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
-                                               /*AllowAlternate=*/false);
+        size_t Key = 0, Idx = 0;
+        if (!isOrderedFaddReduction())
+          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+                                                 /*AllowAlternate=*/false);
         ++PossibleReducedVals[Key][Idx]
               .insert(std::make_pair(V, 0))
               .first->second;
@@ -22200,13 +22255,15 @@
            It != E; ++It) {
         PossibleRedValsVect.emplace_back();
         auto RedValsVect = It->second.takeVector();
-        stable_sort(RedValsVect, llvm::less_second());
+        if (!isOrderedFaddReduction())
+          stable_sort(RedValsVect, llvm::less_second());
         for (const std::pair<Value *, unsigned> &Data : RedValsVect)
           PossibleRedValsVect.back().append(Data.second, Data.first);
       }
-      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
-        return P1.size() > P2.size();
-      });
+      if (!isOrderedFaddReduction())
+        stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+          return P1.size() > P2.size();
+        });
       int NewIdx = -1;
       for (ArrayRef<Value *> Data : PossibleRedValsVect) {
         if (NewIdx < 0 ||
@@ -22226,9 +22283,19 @@
     }
     // Sort the reduced values by number of same/alternate opcode and/or pointer
     // operand.
-    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
-      return P1.size() > P2.size();
-    });
+    if (!isOrderedFaddReduction())
+      stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+        return P1.size() > P2.size();
+      });
+
+    if (isOrderedFaddReduction() &&
+        (ReducedVals.size() != 1 || ReducedVals[0].size() == 2 ||
+         !checkOperandsOrder()))
+      return false;
+
+    if (!checkFastMathFlags())
+      return false;
+
     return true;
   }
@@ -22423,7 +22490,7 @@
     // original scalar identity operations on matched horizontal reductions).
     IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                   RdxKind != RecurKind::FMul &&
-                                  RdxKind != RecurKind::FMulAdd;
+                                  RdxKind != RecurKind::FMulAdd && !isOrderedFaddReduction();
     // Gather same values.
     SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
     if (IsSupportedHorRdxIdentityOp)
@@ -22524,6 +22591,8 @@
       return IsAnyRedOpGathered;
     };
     bool AnyVectorized = false;
+    Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+    Instruction *InsertPt = RdxRootInst;
     SmallDenseSet, 8> IgnoredCandidates;
     while (Pos < NumReducedVals - ReduxWidth + 1 &&
            ReduxWidth >= ReductionLimit) {
@@ -22684,8 +22753,6 @@
       // Emit a reduction. If the root is a select (min/max idiom), the insert
       // point is the compare condition of that select.
-      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
-      Instruction *InsertPt = RdxRootInst;
       if (IsCmpSelMinMax)
         InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
@@ -22738,6 +22805,41 @@
         if (!V.isVectorized(RdxVal))
           RequiredExtract.insert(RdxVal);
       }
+
+      auto FirstIt = find_if(ReducedVals[0], [&](Value *RdxVal) {
+        return VectorizedVals.lookup(RdxVal);
+      });
+      auto LastIt = find_if(reverse(ReducedVals[0]), [&](Value *RdxVal) {
+        return VectorizedVals.lookup(RdxVal);
+      });
+      if (isOrderedFaddReduction()) {
+        // [FirstIt, LastIt] - range of vectorized Vals; we need it to get the
+        // last non-vectorized Val at the beginning and its ReductionOp, and
+        // the first non-vectorized Val at the end and its ReductionOp:
+        //   fadd - initial value for reduction
+        //   fadd - v
+        //   fadd - v
+        //   fadd - v
+        //   fadd - v
+        //   fadd - scalar remainder
+        if (LastIt != ReducedVals[0].rend())
+          ReductionRoot =
+              cast<Instruction>(ReducedValsToOps.find(*LastIt)->second[0]);
+
+        if (InitialFAddValues.empty()) {
+          auto *FAddBinOp = cast<Instruction>(
+              ReducedValsToOps.find(*FirstIt)->second[0]);
+          Value *InitialFAddValue = ConstantExpr::getBinOpIdentity(
+              FAddBinOp->getOpcode(), FAddBinOp->getType());
+          if (FirstIt != ReducedVals[0].end()) {
+            auto *Op1 = FAddBinOp->getOperand(0);
+            if (!isa(Op1))
+              InitialFAddValue = Op1;
+          }
+          InitialFAddValues.push_back(InitialFAddValue);
+        }
+      }
+
       Pos += ReduxWidth;
       Start = Pos;
       ReduxWidth = NumReducedVals - Pos;
@@ -22755,10 +22857,27 @@
         continue;
       }
     }
-    if (!VectorValuesAndScales.empty())
-      VectorizedTree = GetNewVectorizedTree(
-          VectorizedTree,
-          emitReduction(Builder, *TTI, ReductionRoot->getType()));
+    if (!VectorValuesAndScales.empty()) {
+      if (!isOrderedFaddReduction()) {
+        VectorizedTree = GetNewVectorizedTree(
+            VectorizedTree,
+            emitReduction(Builder, *TTI, ReductionRoot->getType()));
+      } else {
+        for (auto V : VectorValuesAndScales) {
+          Value *InitialFAddValue = InitialFAddValues.back();
+          VectorizedTree = Builder.CreateFAddReduce(InitialFAddValue, std::get<0>(V));
+          InitialFAddValues.push_back(VectorizedTree);
+        }
+        auto LastIt = find_if(reverse(ReducedVals[0]), [&](Value *RdxVal) {
+          return VectorizedVals.lookup(RdxVal);
+        });
+        for_each(reverse(make_range(LastIt.base(), ReducedVals[0].end())),
+                 [&](Value *V) {
+                   ReducedValsToOps.find(V)->second[0]->moveAfter(
+                       cast<Instruction>(VectorizedTree));
+                 });
+      }
+    }
     if (VectorizedTree) {
       // Reorder operands of bool logical op in the natural order to avoid
       // possible problem with poison propagation. If not possible to reorder
@@ -22846,15 +22965,18 @@
           ExtraReductions.emplace_back(RedOp, RdxVal);
         }
       }
-      // Iterate through all not-vectorized reduction values/extra arguments.
-      bool InitStep = true;
-      while (ExtraReductions.size() > 1) {
-        SmallVector<std::pair<Value *, Value *>> NewReds =
-            FinalGen(ExtraReductions, InitStep);
-        ExtraReductions.swap(NewReds);
-        InitStep = false;
+
+      if (!isOrderedFaddReduction()) {
+        // Iterate through all not-vectorized reduction values/extra arguments.
+ bool InitStep = true; + while (ExtraReductions.size() > 1) { + SmallVector> NewReds = + FinalGen(ExtraReductions, InitStep); + ExtraReductions.swap(NewReds); + InitStep = false; + } + VectorizedTree = ExtraReductions.front().second; } - VectorizedTree = ExtraReductions.front().second; ReductionRoot->replaceAllUsesWith(VectorizedTree); @@ -22868,21 +22990,28 @@ class HorizontalReduction { IgnoreSet.insert_range(RdxOps); #endif for (ArrayRef RdxOps : ReductionOps) { + SmallVector RdxOpsForDeletion; for (Value *Ignore : RdxOps) { - if (!Ignore) + if (!Ignore || (isOrderedFaddReduction() && !Ignore->use_empty() && + !any_of(cast(Ignore)->operands(), + [](const Value *Val) { + return isa(Val); + }))) continue; #ifndef NDEBUG for (auto *U : Ignore->users()) { - assert(IgnoreSet.count(U) && - "All users must be either in the reduction ops list."); + assert((IgnoreSet.count(U) || + isOrderedFaddReduction()) && + "All users must be either in the reduction ops list."); } #endif if (!Ignore->use_empty()) { Value *P = PoisonValue::get(Ignore->getType()); Ignore->replaceAllUsesWith(P); } + RdxOpsForDeletion.push_back(Ignore); } - V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales); + V.removeInstructionsAndOperands(ArrayRef(RdxOpsForDeletion), VectorValuesAndScales); } } else if (!CheckForReusedReductionOps) { for (ReductionOpsType &RdxOps : ReductionOps) @@ -22961,6 +23090,8 @@ class HorizontalReduction { continue; } InstructionCost ScalarCost = 0; + if (RdxVal->use_empty()) + continue; for (User *U : RdxVal->users()) { auto *RdxOp = cast(U); if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index f16c879c451c2..8f541a3dface3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -10,21 +10,10 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { ; CHECK-LABEL: @dot4f64( -; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2 -; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[DOT0123:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]]) ; CHECK-NEXT: ret double [[DOT0123]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 
1 @@ -53,21 +42,10 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot4f32( -; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2 -; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[DOT0123:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: ret float [[DOT0123]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 17ae33652b6d8..c1a0c293ef9b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -136,44 +136,39 @@ for.end: ; preds = %for.body define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; 
CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 ; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i32 2 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP18]], <2 x float> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], ; CHECK-NEXT: [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL25:%.*]] = fmul float [[TMP8]], 1.100000e+01 +; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL25]] ; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP15]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP15]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP15]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP15]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]]) +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[TMP17]], [[ADD6]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: @@ -237,19 +232,13 @@ define float @sort_phi_type(ptr nocapture readonly %A) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ splat (float 1.000000e+01), [[ENTRY]] ], [ [[TMP2:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2]] = fmul <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt 
i64 [[INDVARS_IV_NEXT]], 128 +; CHECK-NEXT: [[TMP2]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP6]] +; CHECK-NEXT: [[ADD31:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP1]]) ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll b/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll new file mode 100644 index 0000000000000..46aba65eb1b29 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define float @_Z3fooPi(ptr %a){ +; CHECK-LABEL: define float @_Z3fooPi( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i32> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fdiv <8 x double> [[TMP2]], splat (double 1.000000e-01) +; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_8]], align 4 +; CHECK-NEXT: [[MUL_8:%.*]] = mul nsw i32 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[CONV_8:%.*]] = uitofp nneg i32 [[MUL_8]] to double +; CHECK-NEXT: [[DIV_8:%.*]] = fdiv double [[CONV_8]], 1.000000e-01 +; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 36 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_9]], align 4 +; CHECK-NEXT: [[MUL_9:%.*]] = mul nsw i32 [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[CONV_9:%.*]] = uitofp nneg i32 [[MUL_9]] to double +; CHECK-NEXT: [[OP_RDX:%.*]] = fdiv double [[CONV_9]], 1.000000e-01 +; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP3]]) +; CHECK-NEXT: [[ADD_8:%.*]] = fadd double [[TMP7]], [[DIV_8]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd double [[ADD_8]], [[OP_RDX]] +; CHECK-NEXT: [[DIV4:%.*]] = fdiv double [[OP_RDX1]], 5.000000e+03 +; CHECK-NEXT: [[SUB:%.*]] = fadd double [[DIV4]], -5.000000e+03 +; CHECK-NEXT: [[CONV6:%.*]] = fptrunc double [[SUB]] to float +; CHECK-NEXT: ret float [[CONV6]] +; +entry: + %0 = load i32, ptr %a, align 4 + %mul = mul nsw i32 %0, %0 + %conv = uitofp nneg i32 %mul to double + %div = fdiv double %conv, 1.000000e-01 + %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 4 + %1 = load i32, ptr %arrayidx.1, align 4 + %mul.1 = mul nsw i32 %1, %1 + %conv.1 = uitofp nneg i32 %mul.1 to double + %div.1 = fdiv double %conv.1, 1.000000e-01 + %add.1 = fadd double %div, %div.1 + %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 8 + %2 = load i32, ptr %arrayidx.2, align 4 + %mul.2 = mul nsw i32 %2, %2 + %conv.2 = uitofp nneg i32 %mul.2 to double + %div.2 = fdiv double %conv.2, 1.000000e-01 + %add.2 = fadd double %add.1, 
%div.2 + %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 12 + %3 = load i32, ptr %arrayidx.3, align 4 + %mul.3 = mul nsw i32 %3, %3 + %conv.3 = uitofp nneg i32 %mul.3 to double + %div.3 = fdiv double %conv.3, 1.000000e-01 + %add.3 = fadd double %add.2, %div.3 + %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 16 + %4 = load i32, ptr %arrayidx.4, align 4 + %mul.4 = mul nsw i32 %4, %4 + %conv.4 = uitofp nneg i32 %mul.4 to double + %div.4 = fdiv double %conv.4, 1.000000e-01 + %add.4 = fadd double %add.3, %div.4 + %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 20 + %5 = load i32, ptr %arrayidx.5, align 4 + %mul.5 = mul nsw i32 %5, %5 + %conv.5 = uitofp nneg i32 %mul.5 to double + %div.5 = fdiv double %conv.5, 1.000000e-01 + %add.5 = fadd double %add.4, %div.5 + %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 24 + %6 = load i32, ptr %arrayidx.6, align 4 + %mul.6 = mul nsw i32 %6, %6 + %conv.6 = uitofp nneg i32 %mul.6 to double + %div.6 = fdiv double %conv.6, 1.000000e-01 + %add.6 = fadd double %add.5, %div.6 + %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 28 + %7 = load i32, ptr %arrayidx.7, align 4 + %mul.7 = mul nsw i32 %7, %7 + %conv.7 = uitofp nneg i32 %mul.7 to double + %div.7 = fdiv double %conv.7, 1.000000e-01 + %add.7 = fadd double %add.6, %div.7 + %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 32 + %8 = load i32, ptr %arrayidx.8, align 4 + %mul.8 = mul nsw i32 %8, %8 + %conv.8 = uitofp nneg i32 %mul.8 to double + %div.8 = fdiv double %conv.8, 1.000000e-01 + %add.8 = fadd double %add.7, %div.8 + %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 36 + %9 = load i32, ptr %arrayidx.9, align 4 + %mul.9 = mul nsw i32 %9, %9 + %conv.9 = uitofp nneg i32 %mul.9 to double + %div.9 = fdiv double %conv.9, 1.000000e-01 + %add.9 = fadd double %add.8, %div.9 + %div4 = fdiv double %add.9, 5.000000e+03 + %sub = fadd double %div4, -5.000000e+03 + %conv6 = fptrunc double %sub to float + ret float %conv6 +} diff --git a/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll new file mode 100644 index 0000000000000..356480bc11591 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define float @test_reduce(ptr %a) { +; CHECK-LABEL: define float @test_reduce( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]]) +; CHECK-NEXT: ret float [[TMP1]] +; +entry: + %0 = load float, ptr %a, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd float %add, %2 + %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd float %add3, %3 + ret float %add5 +} + +define float @test_no_reduce(ptr %a) { +; CHECK-LABEL: define float @test_no_reduce( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[ADD4]] +; CHECK-NEXT: ret float [[ADD5]] +; +entry: + %0 = load float, ptr %a, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8 + %2 = load float, ptr %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i8, ptr %a, i64 12 + %3 = load float, ptr %arrayidx3, align 4 + %add4 = fadd float %2, %3 + %add5 = fadd float %add, %add4 + ret float %add5 +} + +define float @test_reduce2(ptr %a, float %b) { +; CHECK-LABEL: define float @test_reduce2( +; CHECK-SAME: ptr [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]]) +; CHECK-NEXT: [[ADDB:%.*]] = fadd float [[TMP1]], [[B]] +; CHECK-NEXT: ret float [[TMP1]] +; +entry: + %0 = load float, ptr %a, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd float %add, %2 + %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd float %add3, %3 + %addb = fadd float %add5, %b + ret float %add5 +} + +define float @test_reduce_multiple_use(ptr %a, float %b) { +; CHECK-LABEL: define float @test_reduce_multiple_use( +; CHECK-SAME: ptr [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: [[ADDC:%.*]] = fadd float [[B]], [[TMP1]] +; CHECK-NEXT: [[ADD6:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[ADDC]], <4 x float> [[TMP0]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd float [[ADD6]], [[B]] +; CHECK-NEXT: ret float [[OP_RDX1]] +; +entry: + %0 = load float, ptr %a, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4 + %1 = load float, ptr %arrayidx1, align 4 + %addc = fadd float %b, %0 + %addb = fadd float %addc, %0 + %add = fadd float %addb, %1 + %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd float %add, %2 + %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd float %add3, %3 + %add6 = fadd float %add5, %b + ret float %add6 +} + +define double @test_reduce_multiple_reductions(ptr %freq, double %sum) { +; CHECK-LABEL: define double @test_reduce_multiple_reductions( +; CHECK-SAME: ptr [[FREQ:%.*]], double [[SUM:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <32 x double>, ptr [[FREQ]], align 8 +; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 256 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x double>, ptr [[ARRAYIDX_32]], align 8 +; CHECK-NEXT: [[ARRAYIDX_48:%.*]] = getelementptr inbounds i8, ptr 
[[FREQ]], i64 384 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x double>, ptr [[ARRAYIDX_48]], align 8 +; CHECK-NEXT: [[ARRAYIDX_56:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 448 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x double>, ptr [[ARRAYIDX_56]], align 8 +; CHECK-NEXT: [[ARRAYIDX_60:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 480 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX_60]], align 8 +; CHECK-NEXT: [[ARRAYIDX_61:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 488 +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX_61]], align 8 +; CHECK-NEXT: [[ARRAYIDX_62:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 496 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX_62]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> [[TMP0]]) +; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.vector.reduce.fadd.v16f64(double [[TMP7]], <16 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double [[TMP8]], <8 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP13:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double [[TMP9]], <4 x double> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd double [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[ADD_61:%.*]] = fadd double [[OP_RDX]], [[TMP5]] +; CHECK-NEXT: [[ADD_62:%.*]] = fadd double [[ADD_61]], [[TMP6]] +; CHECK-NEXT: ret double [[ADD_62]] +; +entry: + %0 = load double, ptr %freq, align 8 + %arrayidx.1 = getelementptr inbounds i8, ptr %freq, i64 8 + %1 = load double, ptr %arrayidx.1, align 8 + %add.1 = fadd double %0, %1 + %arrayidx.2 = getelementptr inbounds i8, ptr %freq, i64 16 + %2 = load double, ptr %arrayidx.2, align 8 + %add.2 = fadd double %add.1, %2 + %arrayidx.3 = getelementptr inbounds i8, ptr %freq, i64 24 + %3 = load double, ptr %arrayidx.3, align 8 + %add.3 = fadd double %add.2, %3 + %arrayidx.4 = getelementptr inbounds i8, ptr %freq, i64 32 + %4 = load double, ptr %arrayidx.4, align 8 + %add.4 = fadd double %add.3, %4 + %arrayidx.5 = getelementptr inbounds i8, ptr %freq, i64 40 + %5 = load double, ptr %arrayidx.5, align 8 + %add.5 = fadd double %add.4, %5 + %arrayidx.6 = getelementptr inbounds i8, ptr %freq, i64 48 + %6 = load double, ptr %arrayidx.6, align 8 + %add.6 = fadd double %add.5, %6 + %arrayidx.7 = getelementptr inbounds i8, ptr %freq, i64 56 + %7 = load double, ptr %arrayidx.7, align 8 + %add.7 = fadd double %add.6, %7 + %arrayidx.8 = getelementptr inbounds i8, ptr %freq, i64 64 + %8 = load double, ptr %arrayidx.8, align 8 + %add.8 = fadd double %add.7, %8 + %arrayidx.9 = getelementptr inbounds i8, ptr %freq, i64 72 + %9 = load double, ptr %arrayidx.9, align 8 + %add.9 = fadd double %add.8, %9 + %arrayidx.10 = getelementptr inbounds i8, ptr %freq, i64 80 + %10 = load double, ptr %arrayidx.10, align 8 + %add.10 = fadd double %add.9, %10 + %arrayidx.11 = getelementptr inbounds i8, ptr %freq, i64 88 + %11 = load double, ptr %arrayidx.11, align 8 + %add.11 = fadd double %add.10, %11 + %arrayidx.12 = getelementptr inbounds i8, ptr %freq, i64 96 + %12 = load double, ptr %arrayidx.12, align 8 + %add.12 = fadd double %add.11, %12 + %arrayidx.13 = getelementptr inbounds i8, ptr %freq, i64 104 + %13 = load double, ptr %arrayidx.13, align 8 + %add.13 = fadd double %add.12, %13 + %arrayidx.14 = getelementptr inbounds i8, ptr %freq, i64 112 + %14 = load double, ptr %arrayidx.14, align 8 + %add.14 = fadd double %add.13, %14 + %arrayidx.15 = getelementptr inbounds i8, ptr %freq, i64 120 + %15 = load double, ptr 
%arrayidx.15, align 8 + %add.15 = fadd double %add.14, %15 + %arrayidx.16 = getelementptr inbounds i8, ptr %freq, i64 128 + %16 = load double, ptr %arrayidx.16, align 8 + %add.16 = fadd double %add.15, %16 + %arrayidx.17 = getelementptr inbounds i8, ptr %freq, i64 136 + %17 = load double, ptr %arrayidx.17, align 8 + %add.17 = fadd double %add.16, %17 + %arrayidx.18 = getelementptr inbounds i8, ptr %freq, i64 144 + %18 = load double, ptr %arrayidx.18, align 8 + %add.18 = fadd double %add.17, %18 + %arrayidx.19 = getelementptr inbounds i8, ptr %freq, i64 152 + %19 = load double, ptr %arrayidx.19, align 8 + %add.19 = fadd double %add.18, %19 + %arrayidx.20 = getelementptr inbounds i8, ptr %freq, i64 160 + %20 = load double, ptr %arrayidx.20, align 8 + %add.20 = fadd double %add.19, %20 + %arrayidx.21 = getelementptr inbounds i8, ptr %freq, i64 168 + %21 = load double, ptr %arrayidx.21, align 8 + %add.21 = fadd double %add.20, %21 + %arrayidx.22 = getelementptr inbounds i8, ptr %freq, i64 176 + %22 = load double, ptr %arrayidx.22, align 8 + %add.22 = fadd double %add.21, %22 + %arrayidx.23 = getelementptr inbounds i8, ptr %freq, i64 184 + %23 = load double, ptr %arrayidx.23, align 8 + %add.23 = fadd double %add.22, %23 + %arrayidx.24 = getelementptr inbounds i8, ptr %freq, i64 192 + %24 = load double, ptr %arrayidx.24, align 8 + %add.24 = fadd double %add.23, %24 + %arrayidx.25 = getelementptr inbounds i8, ptr %freq, i64 200 + %25 = load double, ptr %arrayidx.25, align 8 + %add.25 = fadd double %add.24, %25 + %arrayidx.26 = getelementptr inbounds i8, ptr %freq, i64 208 + %26 = load double, ptr %arrayidx.26, align 8 + %add.26 = fadd double %add.25, %26 + %arrayidx.27 = getelementptr inbounds i8, ptr %freq, i64 216 + %27 = load double, ptr %arrayidx.27, align 8 + %add.27 = fadd double %add.26, %27 + %arrayidx.28 = getelementptr inbounds i8, ptr %freq, i64 224 + %28 = load double, ptr %arrayidx.28, align 8 + %add.28 = fadd double %add.27, %28 + %arrayidx.29 = getelementptr inbounds i8, ptr %freq, i64 232 + %29 = load double, ptr %arrayidx.29, align 8 + %add.29 = fadd double %add.28, %29 + %arrayidx.30 = getelementptr inbounds i8, ptr %freq, i64 240 + %30 = load double, ptr %arrayidx.30, align 8 + %add.30 = fadd double %add.29, %30 + %arrayidx.31 = getelementptr inbounds i8, ptr %freq, i64 248 + %31 = load double, ptr %arrayidx.31, align 8 + %add.31 = fadd double %add.30, %31 + %arrayidx.32 = getelementptr inbounds i8, ptr %freq, i64 256 + %32 = load double, ptr %arrayidx.32, align 8 + %add.32 = fadd double %add.31, %32 + %arrayidx.33 = getelementptr inbounds i8, ptr %freq, i64 264 + %33 = load double, ptr %arrayidx.33, align 8 + %add.33 = fadd double %add.32, %33 + %arrayidx.34 = getelementptr inbounds i8, ptr %freq, i64 272 + %34 = load double, ptr %arrayidx.34, align 8 + %add.34 = fadd double %add.33, %34 + %arrayidx.35 = getelementptr inbounds i8, ptr %freq, i64 280 + %35 = load double, ptr %arrayidx.35, align 8 + %add.35 = fadd double %add.34, %35 + %arrayidx.36 = getelementptr inbounds i8, ptr %freq, i64 288 + %36 = load double, ptr %arrayidx.36, align 8 + %add.36 = fadd double %add.35, %36 + %arrayidx.37 = getelementptr inbounds i8, ptr %freq, i64 296 + %37 = load double, ptr %arrayidx.37, align 8 + %add.37 = fadd double %add.36, %37 + %arrayidx.38 = getelementptr inbounds i8, ptr %freq, i64 304 + %38 = load double, ptr %arrayidx.38, align 8 + %add.38 = fadd double %add.37, %38 + %arrayidx.39 = getelementptr inbounds i8, ptr %freq, i64 312 + %39 = load double, ptr %arrayidx.39, align 8 + 
%add.39 = fadd double %add.38, %39 + %arrayidx.40 = getelementptr inbounds i8, ptr %freq, i64 320 + %40 = load double, ptr %arrayidx.40, align 8 + %add.40 = fadd double %add.39, %40 + %arrayidx.41 = getelementptr inbounds i8, ptr %freq, i64 328 + %41 = load double, ptr %arrayidx.41, align 8 + %add.41 = fadd double %add.40, %41 + %arrayidx.42 = getelementptr inbounds i8, ptr %freq, i64 336 + %42 = load double, ptr %arrayidx.42, align 8 + %add.42 = fadd double %add.41, %42 + %arrayidx.43 = getelementptr inbounds i8, ptr %freq, i64 344 + %43 = load double, ptr %arrayidx.43, align 8 + %add.43 = fadd double %add.42, %43 + %arrayidx.44 = getelementptr inbounds i8, ptr %freq, i64 352 + %44 = load double, ptr %arrayidx.44, align 8 + %add.44 = fadd double %add.43, %44 + %arrayidx.45 = getelementptr inbounds i8, ptr %freq, i64 360 + %45 = load double, ptr %arrayidx.45, align 8 + %add.45 = fadd double %add.44, %45 + %arrayidx.46 = getelementptr inbounds i8, ptr %freq, i64 368 + %46 = load double, ptr %arrayidx.46, align 8 + %add.46 = fadd double %add.45, %46 + %arrayidx.47 = getelementptr inbounds i8, ptr %freq, i64 376 + %47 = load double, ptr %arrayidx.47, align 8 + %add.47 = fadd double %add.46, %47 + %arrayidx.48 = getelementptr inbounds i8, ptr %freq, i64 384 + %48 = load double, ptr %arrayidx.48, align 8 + %add.48 = fadd double %add.47, %48 + %arrayidx.49 = getelementptr inbounds i8, ptr %freq, i64 392 + %49 = load double, ptr %arrayidx.49, align 8 + %add.49 = fadd double %add.48, %49 + %arrayidx.50 = getelementptr inbounds i8, ptr %freq, i64 400 + %50 = load double, ptr %arrayidx.50, align 8 + %add.50 = fadd double %add.49, %50 + %arrayidx.51 = getelementptr inbounds i8, ptr %freq, i64 408 + %51 = load double, ptr %arrayidx.51, align 8 + %add.51 = fadd double %add.50, %51 + %arrayidx.52 = getelementptr inbounds i8, ptr %freq, i64 416 + %52 = load double, ptr %arrayidx.52, align 8 + %add.52 = fadd double %add.51, %52 + %arrayidx.53 = getelementptr inbounds i8, ptr %freq, i64 424 + %53 = load double, ptr %arrayidx.53, align 8 + %add.53 = fadd double %add.52, %53 + %arrayidx.54 = getelementptr inbounds i8, ptr %freq, i64 432 + %54 = load double, ptr %arrayidx.54, align 8 + %add.54 = fadd double %add.53, %54 + %arrayidx.55 = getelementptr inbounds i8, ptr %freq, i64 440 + %55 = load double, ptr %arrayidx.55, align 8 + %add.55 = fadd double %add.54, %55 + %arrayidx.56 = getelementptr inbounds i8, ptr %freq, i64 448 + %56 = load double, ptr %arrayidx.56, align 8 + %add.56 = fadd double %add.55, %56 + %arrayidx.57 = getelementptr inbounds i8, ptr %freq, i64 456 + %57 = load double, ptr %arrayidx.57, align 8 + %add.57 = fadd double %add.56, %57 + %arrayidx.58 = getelementptr inbounds i8, ptr %freq, i64 464 + %58 = load double, ptr %arrayidx.58, align 8 + %add.58 = fadd double %add.57, %58 + %arrayidx.59 = getelementptr inbounds i8, ptr %freq, i64 472 + %59 = load double, ptr %arrayidx.59, align 8 + %add.59 = fadd double %add.58, %59 + %arrayidx.60 = getelementptr inbounds i8, ptr %freq, i64 480 + %60 = load double, ptr %arrayidx.60, align 8 + %add.60 = fadd double %add.59, %60 + %arrayidx.61 = getelementptr inbounds i8, ptr %freq, i64 488 + %61 = load double, ptr %arrayidx.61, align 8 + %add.61 = fadd double %add.60, %61 + %arrayidx.62 = getelementptr inbounds i8, ptr %freq, i64 496 + %62 = load double, ptr %arrayidx.62, align 8 + %add.62 = fadd double %add.61, %62 + ret double %add.62 +}
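
Note (not part of the patch): the soundness of the transform exercised by the new tests rests on the LangRef semantics of llvm.vector.reduce.fadd - when the call does not carry the 'reassoc' fast-math flag, the reduction is performed sequentially, starting from the scalar start operand. A minimal IR sketch of the equivalence, with hypothetical function names:

; Strictly ordered scalar fadd chain: (((start + v0) + v1) + v2) + v3.
define double @chain(double %start, <4 x double> %v) {
  %e0 = extractelement <4 x double> %v, i32 0
  %a0 = fadd double %start, %e0
  %e1 = extractelement <4 x double> %v, i32 1
  %a1 = fadd double %a0, %e1
  %e2 = extractelement <4 x double> %v, i32 2
  %a2 = fadd double %a1, %e2
  %e3 = extractelement <4 x double> %v, i32 3
  %a3 = fadd double %a2, %e3
  ret double %a3
}

; Equivalent ordered form emitted by the patch: no 'reassoc' on the call,
; so the intrinsic performs the same sequential reduction from %start.
define double @ordered(double %start, <4 x double> %v) {
  %r = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %v)
  ret double %r
}

declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)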