diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 08ab4ee2ec1cf..9048481b49189 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1473,6 +1473,12 @@ class TargetTransformInfo { TTI::TargetCostKind CostKind, unsigned Index = -1) const; + /// \return The expected cost of aggregate inserts and extracts. This is + /// used when the instruction is not available; a typical use case is to + /// provision the cost of vectorization/scalarization in vectorizer passes. + InstructionCost getInsertExtractValueCost(unsigned Opcode, + TTI::TargetCostKind CostKind) const; + /// \return The cost of replication shuffle of \p VF elements typed \p EltTy /// \p ReplicationFactor times. /// @@ -2223,6 +2229,9 @@ class TargetTransformInfo::Concept { const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) = 0; + virtual InstructionCost + getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0; + virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -2950,6 +2959,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind); } + InstructionCost + getInsertExtractValueCost(unsigned Opcode, + TTI::TargetCostKind CostKind) override { + return Impl.getInsertExtractValueCost(Opcode, CostKind); + } InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 5128c6b86a5f0..a8d6dd18266bb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -745,6 +745,17 @@ class TargetTransformInfoImplBase { return 1; } + InstructionCost + getInsertExtractValueCost(unsigned Opcode, + TTI::TargetCostKind CostKind) const { + // Note: The `insertvalue` cost here is chosen to match the default case of + // getInstructionCost() -- as pior to adding this helper `insertvalue` was + // not handled. + if (Opcode == Instruction::InsertValue) + return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic; + return TTI::TCC_Free; + } + InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -1306,9 +1317,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { case Instruction::PHI: case Instruction::Switch: return TargetTTI->getCFInstrCost(Opcode, CostKind, I); - case Instruction::ExtractValue: case Instruction::Freeze: return TTI::TCC_Free; + case Instruction::ExtractValue: + case Instruction::InsertValue: + return TargetTTI->getInsertExtractValueCost(Opcode, CostKind); case Instruction::Alloca: if (cast(U)->isStaticAlloca()) return TTI::TCC_Free; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 3c5cf1ebe6ba2..e959d93b57275 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -416,10 +416,6 @@ class LoopVectorizationLegality { /// has a vectorized variant available. 
bool hasVectorCallVariants() const { return VecCallVariantsFound; } - /// Returns true if there is at least one function call in the loop which - /// returns a struct type and needs to be vectorized. - bool hasStructVectorCall() const { return StructVecCallFound; } - unsigned getNumStores() const { return LAI->getNumStores(); } unsigned getNumLoads() const { return LAI->getNumLoads(); } @@ -639,12 +635,6 @@ class LoopVectorizationLegality { /// the use of those function variants. bool VecCallVariantsFound = false; - /// If we find a call (to be vectorized) that returns a struct type, record - /// that so we can bail out until this is supported. - /// TODO: Remove this flag once vectorizing calls with struct returns is - /// supported. - bool StructVecCallFound = false; - /// Keep track of all the countable and uncountable exiting blocks if /// the exact backedge taken count is not computable. SmallVector CountableExitingBlocks; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index dc066099bdc1d..1ca9a16b18112 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1113,6 +1113,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } +InstructionCost TargetTransformInfo::getInsertExtractValueCost( + unsigned Opcode, TTI::TargetCostKind CostKind) const { + assert((Opcode == Instruction::InsertValue || + Opcode == Instruction::ExtractValue) && + "Expecting Opcode to be insertvalue/extractvalue."); + InstructionCost Cost = TTIImpl->getInsertExtractValueCost(Opcode, CostKind); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getReplicationShuffleCost( Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index e3599315e224f..420cbc5384ce4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -954,7 +954,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !VFDatabase::getMappings(*CI).empty()) VecCallVariantsFound = true; - auto CanWidenInstructionTy = [this](Instruction const &Inst) { + auto CanWidenInstructionTy = [](Instruction const &Inst) { Type *InstTy = Inst.getType(); if (!isa(InstTy)) return canVectorizeTy(InstTy); @@ -962,15 +962,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // For now, we only recognize struct values returned from calls where // all users are extractvalue as vectorizable. All element types of the // struct must be types that can be widened. - if (isa(Inst) && canWidenCallReturnType(InstTy) && - all_of(Inst.users(), IsaPred)) { - // TODO: Remove the `StructVecCallFound` flag once vectorizing calls - // with struct returns is supported. - StructVecCallFound = true; - return true; - } - - return false; + return isa(Inst) && canWidenCallReturnType(InstTy) && + all_of(Inst.users(), IsaPred); }; // Check that the instruction return type is vectorizable. 
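Note on the legality change above: it and the cost-model changes below both lean on the VectorTypeUtils helpers (canVectorizeTy, canWidenCallReturnType, toVectorizedTy, getContainedTypes). The short C++ sketch below shows how they compose for a struct-of-scalars return type; it mirrors the updated maybeVectorizeType helper in the LoopVectorize.cpp hunk that follows. The function name widenResultType is illustrative only, and the helpers are assumed to be declared in llvm/IR/VectorTypeUtils.h.

// Illustrative sketch (not part of the patch): widen a scalar result type to
// its vectorized form for a given VF, the way the legality and cost-model
// code in this patch treats struct-of-scalars return types.
#include "llvm/IR/VectorTypeUtils.h"

using namespace llvm;

static Type *widenResultType(Type *ScalarTy, ElementCount VF) {
  // Types that cannot be widened (and the scalar VF) are returned unchanged.
  if (VF.isScalar() || !canVectorizeTy(ScalarTy))
    return ScalarTy;
  // For a struct this yields a struct of vectors, e.g. { float, float }
  // becomes { <4 x float>, <4 x float> } for VF = 4.
  return toVectorizedTy(ScalarTy, VF);
}

For VF = 2 and { float, float } this produces { <2 x float>, <2 x float> }, which is the widened call return type the struct-return.ll tests further down check for.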
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 660a6ef574576..0faf30df4455e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2389,7 +2389,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State) { - assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + assert((!Instr->getType()->isAggregateType() || + canVectorizeTy(Instr->getType())) && + "Expected vectorizable or non-aggregate type."); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2894,10 +2896,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, return ScalarCallCost; } -static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { - if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) - return Elt; - return VectorType::get(Elt, VF); +static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { + if (VF.isScalar() || !canVectorizeTy(Ty)) + return Ty; + return toVectorizedTy(Ty, VF); } InstructionCost @@ -3644,13 +3646,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { } } - // ExtractValue instructions must be uniform, because the operands are - // known to be loop-invariant. if (auto *EVI = dyn_cast(&I)) { - assert(IsOutOfScope(EVI->getAggregateOperand()) && - "Expected aggregate value to be loop invariant"); - AddToWorklistIfAllowed(EVI); - continue; + if (IsOutOfScope(EVI->getAggregateOperand())) { + AddToWorklistIfAllowed(EVI); + continue; + } + // Only ExtractValue instructions where the aggregate value comes from a + // call are allowed to be non-uniform. + assert(isa(EVI->getAggregateOperand()) && + "Expected aggregate value to be call return value"); } // If there's no pointer operand, there's nothing to do. @@ -4513,8 +4517,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, llvm_unreachable("unhandled recipe"); } - auto WillWiden = [&TTI, VF](Type *ScalarTy) { - Type *VectorTy = toVectorTy(ScalarTy, VF); + auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) { unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) return false; @@ -4526,7 +4529,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // explicitly ask TTI about the register class uses for each part. return NumLegalParts <= VF.getKnownMinValue(); } - // Two or more parts that share a register - are vectorized. + // Two or more elements that share a register - are vectorized. return NumLegalParts < VF.getKnownMinValue(); }; @@ -4545,7 +4548,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); if (!Visited.insert({ScalarTy}).second) continue; - if (WillWiden(ScalarTy)) + Type *WideTy = toVectorizedTy(ScalarTy, VF); + if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors)) return true; } } @@ -5503,10 +5507,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. 
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { - ScalarCost += TTI.getScalarizationOverhead( - cast(toVectorTy(I->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, - /*Extract*/ false, CostKind); + Type *WideTy = toVectorizedTy(I->getType(), VF); + for (Type *VectorTy : getContainedTypes(WideTy)) { + ScalarCost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, CostKind); + } ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5517,15 +5524,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // overhead. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) { - assert(VectorType::isValidElementType(J->getType()) && + assert(canVectorizeTy(J->getType()) && "Instruction has non-scalar type"); if (CanBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { - ScalarCost += TTI.getScalarizationOverhead( - cast(toVectorTy(J->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, - /*Extract*/ true, CostKind); + Type *WideTy = toVectorizedTy(J->getType(), VF); + for (Type *VectorTy : getContainedTypes(WideTy)) { + ScalarCost += TTI.getScalarizationOverhead( + cast(VectorTy), + APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, + /*Extract*/ true, CostKind); + } } } @@ -6004,13 +6014,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, return 0; InstructionCost Cost = 0; - Type *RetTy = toVectorTy(I->getType(), VF); + Type *RetTy = toVectorizedTy(I->getType(), VF); if (!RetTy->isVoidTy() && - (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), - /*Insert*/ true, - /*Extract*/ false, CostKind); + (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) { + + for (Type *VectorTy : getContainedTypes(RetTy)) { + Cost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()), + /*Insert=*/true, + /*Extract=*/false, CostKind); + } + } // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6268,9 +6282,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - Type *RetTy = toVectorTy(ScalarRetTy, VF); + Type *RetTy = toVectorizedTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) - Tys.push_back(toVectorTy(ScalarTy, VF)); + Tys.push_back(toVectorizedTy(ScalarTy, VF)); // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. 
@@ -6460,7 +6474,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else - VectorTy = toVectorTy(RetTy, VF); + VectorTy = toVectorizedTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && !TTI.getNumberOfParts(VectorTy)) @@ -8582,7 +8596,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::Shl: case Instruction::Sub: case Instruction::Xor: - case Instruction::Freeze: + case Instruction::Freeze: { SmallVector NewOps(Operands); if (Instruction::isBinaryOp(I->getOpcode())) { // The legacy cost model uses SCEV to check if some of the operands are @@ -8607,6 +8621,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, NewOps[1] = GetConstantViaSCEV(NewOps[1]); } return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); + } + case Instruction::ExtractValue: { + SmallVector NewOps(Operands); + Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); + auto *EVI = cast(I); + assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); + unsigned Idx = EVI->getIndices()[0]; + NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); + } }; } @@ -9888,7 +9912,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison); } - State.packScalarIntoVectorValue(this, *State.Lane); + State.packScalarIntoVectorizedValue(this, *State.Lane); } return; } @@ -10405,13 +10429,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasStructVectorCall()) { - reportVectorizationFailure("Auto-vectorization of calls that return struct " - "types is not yet supported", - "StructCallVectorizationUnsupported", ORE, L); - return false; - } - // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5a88ebeffb18b..c52cbca27de3b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -336,10 +336,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { } else { // Initialize packing with insertelements to start from undef. 
assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); + Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - packScalarIntoVectorValue(Def, Lane); + packScalarIntoVectorizedValue(Def, Lane); VectorValue = get(Def); } Builder.restoreIP(OldIP); @@ -392,13 +392,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { Builder.SetCurrentDebugLocation(DIL); } -void VPTransformState::packScalarIntoVectorValue(VPValue *Def, - const VPLane &Lane) { +void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def, + const VPLane &Lane) { Value *ScalarInst = get(Def, Lane); - Value *VectorValue = get(Def); - VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, - Lane.getAsRuntimeExpr(Builder, VF)); - set(Def, VectorValue); + Value *WideValue = get(Def); + Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF); + if (auto *StructTy = dyn_cast(WideValue->getType())) { + // We must handle each element of a vectorized struct type. + for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) { + Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I); + Value *VectorValue = Builder.CreateExtractValue(WideValue, I); + VectorValue = + Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr); + WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I); + } + } else { + WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr); + } + set(Def, WideValue); } BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 71fb6d42116cf..bf61251fc9133 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -125,6 +125,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { case Instruction::FNeg: case Instruction::Freeze: return inferScalarType(R->getOperand(0)); + case Instruction::ExtractValue: { + assert(R->getNumOperands() == 2 && "expected single level extractvalue"); + auto *StructTy = cast(inferScalarType(R->getOperand(0))); + auto *CI = cast(R->getOperand(1)->getLiveInIRValue()); + return StructTy->getTypeAtIndex(CI->getZExtValue()); + } default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 74713daf904f0..cd1ad9bec91f7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -241,7 +241,7 @@ struct VPTransformState { set(Def, V, VPLane(0)); return; } - assert((VF.isScalar() || V->getType()->isVectorTy()) && + assert((VF.isScalar() || isVectorizedTy(V->getType())) && "scalar values must be stored as (0, 0)"); Data.VPV2Vector[Def] = V; } @@ -290,8 +290,9 @@ struct VPTransformState { /// Set the debug location in the builder using the debug location \p DL. void setDebugLocFrom(DebugLoc DL); - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane); + /// Construct the vectorized value of a scalarized value \p V one lane at a + /// time. 
+ void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane); /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c84a93d7398f7..410fd0c55831c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1129,7 +1129,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) ParamTys.push_back( @@ -1435,6 +1435,14 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); break; } + case Instruction::ExtractValue: { + assert(getNumOperands() == 2 && "expected single level extractvalue"); + Value *Op = State.get(getOperand(0)); + auto *CI = cast(getOperand(1)->getLiveInIRValue()); + Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue()); + State.set(this, Extract); + break; + } case Instruction::Freeze: { Value *Op = State.get(getOperand(0)); @@ -1536,6 +1544,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, Ctx.CostKind); } + case Instruction::ExtractValue: { + return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, + Ctx.CostKind); + } case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 77781f95b0858..2fde624624ee9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -1,15 +1,18 @@ -; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s -; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s +; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s target triple = "aarch64-unknown-linux-gnu" ; Tests basic vectorization of scalable homogeneous struct literal returns. -; TODO: Support vectorization in this case. 
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 +; CHECK: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_A]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_B]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) entry: br label %for.body @@ -32,11 +35,15 @@ exit: ret void } -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 +; CHECK: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_A]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_B]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) entry: br label %for.body @@ -59,11 +66,16 @@ exit: ret void } -; TODO: Support vectorization in this case. 
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; CHECK: entry: +; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck +; CHECK: vector.memcheck: +; CHECK: vector.body: +; CHECK: call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; CHECK: for.body: +; CHECK: call { float, float } @foo(float [[LOAD:%.*]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll new file mode 100644 index 0000000000000..c721493243734 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll @@ -0,0 +1,199 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|@)" --version 5 +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize < %s -S -o - 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST +; REQUIRES: asserts + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-COST-LABEL: struct_return_widen +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { half, half } @foo(half %in_val) +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 +; +; CHECK-COST: Cost of 10 for VF 2: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>) (using library function: fixed_vec_foo) +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> + +define void @struct_return_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @struct_return_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP2:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP3:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD1:%.*]]) +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR2:[0-9]+]] +; CHECK: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body 
] + %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv + %in_val = load half, ptr %arrayidx, align 2 + %call = tail call { half, half } @foo(half %in_val) #0 + %extract_a = extractvalue { half, half } %call, 0 + %extract_b = extractvalue { half, half } %call, 1 + %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv + store half %extract_a, ptr %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv + store half %extract_b, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK-COST-LABEL: struct_return_replicate +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { half, half } @foo(half %in_val) +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 +; +; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> + +define void @struct_return_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @struct_return_replicate( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP4:%.*]] = tail call { half, half } @foo(half [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] +; CHECK: [[TMP6:%.*]] = tail call { half, half } @foo(half [[TMP5:%.*]]) #[[ATTR3]] +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]] +; CHECK: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv + %in_val = load half, ptr %arrayidx, align 2 + ; #1 does not have a fixed-size vector mapping (so replication is used) + %call = tail call { half, half } @foo(half %in_val) #1 + %extract_a = extractvalue { half, half } %call, 0 + %extract_b = extractvalue { half, half } %call, 1 + %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv + store half %extract_a, ptr %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv + store half %extract_b, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK-COST-LABEL: struct_return_scalable +; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail 
call { half, half } @foo(half %in_val) +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0 +; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1 +; +; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; +; CHECK-COST: Cost of 10 for VF vscale x 8: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>, ir) (using library function: scalable_vec_masked_foo) +; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> + +define void @struct_return_scalable(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) #2 { +; CHECK-LABEL: define void @struct_return_scalable( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK: [[ENTRY:.*:]] +; CHECK: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[VECTOR_PH:.*:]] +; CHECK: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[VECTOR_BODY:.*:]] +; CHECK: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[TMP12:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_LOAD:%.*]], splat (i1 true)) +; CHECK: [[TMP13:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_LOAD1:%.*]], splat (i1 true)) +; CHECK: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[MIDDLE_BLOCK:.*:]] +; CHECK: [[SCALAR_PH:.*:]] +; CHECK: [[FOR_BODY:.*:]] +; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]] +; CHECK: [[EXIT:.*:]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv + 
%in_val = load half, ptr %arrayidx, align 2 + %call = tail call { half, half } @foo(half %in_val) #1 + %extract_a = extractvalue { half, half } %call, 0 + %extract_b = extractvalue { half, half } %call, 1 + %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv + store half %extract_a, ptr %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv + store half %extract_b, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + + +declare { half, half } @foo(half) + +declare { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half>) +declare { , } @scalable_vec_masked_foo(, ) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" } +attributes #2 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 9f98e8af2e98c..1b2a809a552d8 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -1,15 +1,20 @@ -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s ; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" ; Tests basic vectorization of homogeneous struct literal returns. -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1 +; CHECK: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4 +; CHECK: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4 entry: br label %for.body @@ -32,11 +37,16 @@ exit: ret void } -; TODO: Support vectorization in this case. 
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1 +; CHECK: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8 +; CHECK: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8 entry: br label %for.body @@ -59,11 +69,36 @@ exit: ret void } -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop +; Note: Later instcombines reduce this down quite a lot. define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_replicate -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}}) +; CHECK: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}}) +; // Lane 0 +; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 +; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK: [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0 +; CHECK: [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1 +; CHECK: [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1 +; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0 +; CHECK: [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1 +; // Lane 1 +; CHECK: [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0 +; CHECK: [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0 +; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1 +; CHECK: [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0 +; CHECK: [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1 +; CHECK: [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1 +; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1 +; CHECK: [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1 +; // Store wide values: +; CHECK: [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0 +; CHECK: [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1 +; CHECK: store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4 +; CHECK: store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4 entry: br label %for.body @@ -87,11 +122,17 @@ exit: ret void } -; TODO: Support vectorization in this case. 
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-NOT: vector.body: +; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; CHECK: entry: +; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck +; CHECK: vector.memcheck: +; CHECK: vector.body: +; CHECK: call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: for.body: +; CHECK call { float, float } @foo(float [[LOAD:%.*]]) entry: br label %for.body @@ -143,11 +184,11 @@ exit: ret void } -; TODO: Support vectorization in this case. -; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported +; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias writeonly %out_a) { ; CHECK-LABEL: define void @struct_return_i32_three_results_widen -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK: call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD:%.*]]) entry: br label %for.body @@ -167,6 +208,40 @@ exit: ret void } +; Test crafted to exercise computePredInstDiscount with struct results +; (mainly it does not crash). +; CHECK-REMARKS: remark: {{.*}} vectorized loop +define void @scalarized_predicated_struct_return(ptr %a) optsize { +; CHECK-LABEL: define void @scalarized_predicated_struct_return +; CHECK: vector.body: +; CHECK: pred.store.if: +; CHECK: tail call { i64, i64 } @bar_i64(i64 %5) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv + %in_val = load i64, ptr %arrayidx, align 8 + %sgt_zero = icmp sgt i64 %in_val, 0 + br i1 %sgt_zero, label %if.then, label %for.inc + +if.then: + %call = tail call { i64, i64 } @bar_i64(i64 %in_val) #6 + %extract_a = extractvalue { i64, i64 } %call, 0 + %div = udiv i64 %extract_a, %in_val + store i64 %div, ptr %arrayidx, align 8 + br label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + ; Negative test. Widening structs of vectors is not supported. 
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { @@ -390,13 +465,14 @@ declare { [2 x float] } @foo_arrays(float) declare { float, [1 x float] } @foo_one_non_widenable_element(float) declare { <1 x float>, <1 x float> } @foo_vectors(<1 x float>) declare { i32, i32, i32 } @qux(i32) +declare { i64, i64 } @bar_i64(i64) declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>) declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>) declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>) declare { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32>) - declare { , } @scalable_vec_masked_foo(, ) +declare { , } @scalable_vec_masked_bar_i64(, ) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar)" } @@ -404,3 +480,4 @@ attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" } attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" } attributes #5 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_qux(fixed_vec_qux)" } +attributes #6 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_bar_i64(scalable_vec_masked_bar_i64)" } diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll new file mode 100644 index 0000000000000..bb61398ae5a6d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll @@ -0,0 +1,122 @@ +; REQUIRES: asserts +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s + +define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_widen' +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%in_val>) (using library function: fixed_vec_foo) +; CHECK-NEXT: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-NEXT: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2> +; CHECK-NEXT: WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a> +; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<[[STEPS]]> +; 
CHECK-NEXT: vp<[[OUT_B_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx4> +; CHECK-NEXT: WIDEN store vp<[[OUT_B_VEC_PTR]]>, ir<%extract_b> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @foo(float %in_val) #0 + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_replicate' +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]> +; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%in_val>) +; CHECK-NEXT: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0> +; CHECK-NEXT: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2> +; CHECK-NEXT: WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a> +; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[OUT_B_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx4> +; CHECK-NEXT: WIDEN store vp<[[OUT_B_VEC_PTR]]>, ir<%extract_b> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + ; #3 does not have a fixed-size vector mapping (so replication is used) + %call = tail call { float, float } @foo(float %in_val) #1 + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store 
float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + + +declare { float, float } @foo(float) + +declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>) +declare { , } @scalable_vec_masked_foo(, ) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
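The new TTI hook is intentionally thin: the vectorizer only needs it to price the widened extractvalue recipes created for struct-returning calls (see VPWidenRecipe::computeCost above), while insertvalue keeps its historical default cost. A minimal C++ usage sketch follows; the wrapper function extractValueCost is hypothetical, but the getInsertExtractValueCost call matches the API added to TargetTransformInfo.h in this patch.

// Hypothetical helper (not part of the patch) querying the new hook the same
// way VPWidenRecipe::computeCost does for a widened extractvalue.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static InstructionCost
extractValueCost(const TargetTransformInfo &TTI,
                 TargetTransformInfo::TargetCostKind CostKind) {
  // With the default TargetTransformInfoImplBase implementation above,
  // extractvalue is modelled as free; targets can override the hook to
  // report a real cost for aggregate inserts and extracts.
  return TTI.getInsertExtractValueCost(Instruction::ExtractValue, CostKind);
}

Targets whose calling convention splits struct-of-vector results across several registers can override the same hook in their TTI implementation if these extracts should not be treated as free.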