diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 144f35e10132f..f76ef6a081dbb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -177,8 +177,8 @@ class VPBuilder {
                       Type *ResultTy, const VPIRFlags &Flags = {},
                       DebugLoc DL = DebugLoc::getUnknown(),
                       const Twine &Name = "") {
-    return tryInsertInstruction(
-        new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name));
+    return tryInsertInstruction(new VPInstructionWithType(
+        Opcode, Operands, ResultTy, Flags, DL, /*IsSingleScalar=*/false, Name));
   }
 
   VPInstruction *createOverflowingOp(unsigned Opcode,
@@ -275,8 +275,8 @@ class VPBuilder {
 
   VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
                                   Type *ResultTy, DebugLoc DL) {
-    return tryInsertInstruction(
-        new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL));
+    return tryInsertInstruction(new VPInstructionWithType(
+        Opcode, Op, ResultTy, {}, DL, /*IsSingleScalar=*/true));
   }
 
   VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f4259d3d69880..f0c38cb214f87 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8070,7 +8070,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
   return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
 }
 
-VPReplicateRecipe *
+VPSingleDefRecipe *
 VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
                                    VFRange &Range) {
   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8128,6 +8128,14 @@ VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
          "Should not predicate a uniform recipe");
+  if (IsUniform && Instruction::isCast(I->getOpcode())) {
+    assert(!IsPredicated && "IsUniform implies unpredicated");
+    auto *Recipe = new VPInstructionWithType(
+        I->getOpcode(), Operands, I->getType(), VPIRFlags(*I), I->getDebugLoc(),
+        IsUniform, I->getName());
+    Recipe->setUnderlyingValue(I);
+    return Recipe;
+  }
   auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
                                        VPIRMetadata(*I, LVer));
   return Recipe;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 8369c78a2d78f..19735b26dae28 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -199,7 +199,7 @@ class VPRecipeBuilder {
   /// Build a VPReplicationRecipe for \p I using \p Operands. If it is
   /// predicated, add the mask as last operand. Range.End may be decreased to
   /// ensure same recipe behavior from \p Range.Start to \p Range.End.
-  VPReplicateRecipe *handleReplication(Instruction *I,
+  VPSingleDefRecipe *handleReplication(Instruction *I,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4163b0743a9a..4c59ba058597e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -919,6 +919,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
+  /// True if the VPInstruction produces a single scalar value.
+  bool IsSingleScalar;
+
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
   enum {
@@ -1009,7 +1012,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
-                const Twine &Name = "");
+                const Twine &Name = "", bool IsSingleScalar = false);
 
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
 
@@ -1096,8 +1099,9 @@ class VPInstructionWithType : public VPInstruction {
 public:
   VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
                         Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL,
-                        const Twine &Name = "")
-      : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {}
+                        bool IsSingleScalar = false, const Twine &Name = "")
+      : VPInstruction(Opcode, Operands, Flags, DL, Name, IsSingleScalar),
+        ResultTy(ResultTy) {}
 
   static inline bool classof(const VPRecipeBase *R) {
     // VPInstructionWithType are VPInstructions with specific opcodes requiring
@@ -1124,7 +1128,7 @@ class VPInstructionWithType : public VPInstruction {
     SmallVector<VPValue *> Operands(operands());
     auto *New =
         new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this,
-                                  getDebugLoc(), getName());
+                                  getDebugLoc(), isSingleScalar(), getName());
     New->setUnderlyingValue(getUnderlyingValue());
     return New;
   }
@@ -1133,10 +1137,7 @@ class VPInstructionWithType : public VPInstruction {
 
   /// Return the cost of this VPInstruction.
   InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override {
-    // TODO: Compute accurate cost after retiring the legacy cost model.
-    return 0;
-  }
+                              VPCostContext &Ctx) const override;
 
   Type *getResultType() const { return ResultTy; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3e12fdf9163eb..c00d46bd3e579 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -410,9 +410,10 @@ template class VPUnrollPartAccessor<3>;
 
 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
-                             const Twine &Name)
+                             const Twine &Name, bool IsSingleScalar)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(), IsSingleScalar(IsSingleScalar), Opcode(Opcode),
+      Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -866,7 +867,8 @@ bool VPInstruction::isVectorToScalar() const {
 }
 
 bool VPInstruction::isSingleScalar() const {
-  return getOpcode() == Instruction::PHI || isScalarCast();
+  // TODO: Set IsSingleScalar for PHI.
+  return IsSingleScalar || getOpcode() == Instruction::PHI;
 }
 
 void VPInstruction::execute(VPTransformState &State) {
@@ -1079,13 +1081,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 
 void VPInstructionWithType::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
-  if (isScalarCast()) {
+  if (Instruction::isCast(getOpcode())) {
     Value *Op = State.get(getOperand(0), VPLane(0));
     Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                            Op, ResultTy);
+    if (auto *I = dyn_cast<Instruction>(Cast))
+      applyFlags(*I);
     State.set(this, Cast, VPLane(0));
     return;
   }
+
   switch (getOpcode()) {
   case VPInstruction::StepVector: {
     Value *StepVector =
@@ -1098,6 +1103,15 @@ void VPInstructionWithType::execute(VPTransformState &State) {
   }
 }
 
+InstructionCost VPInstructionWithType::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  // TODO: Compute cost for VPInstructions without underlying values once
+  // the legacy cost model has been retired.
+  if (!getUnderlyingValue())
+    return 0;
+  return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
@@ -1643,12 +1657,13 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
     return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
            Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
            Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
+           Opcode == Instruction::FPTrunc || Opcode == Instruction::FPExt ||
           Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
            Opcode == VPInstruction::WideIVStep ||
            Opcode == VPInstruction::ReductionStartVector ||
            Opcode == VPInstruction::ComputeReductionResult;
   case OperationType::NonNegOp:
-    return Opcode == Instruction::ZExt;
+    return Opcode == Instruction::UIToFP || Opcode == Instruction::ZExt;
     break;
   case OperationType::Cmp:
     return Opcode == Instruction::ICmp;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ac6be09ef271d..3736b7ee1a924 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1031,8 +1031,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                              ? Instruction::SExt
                              : Instruction::ZExt;
-    auto *VPC =
-        new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+    VPSingleDefRecipe *VPC;
+    if (vputils::isSingleScalar(Def))
+      VPC = new VPInstructionWithType(Instruction::CastOps(ExtOpcode), {A},
+                                      TruncTy, {}, Def->getDebugLoc(),
+                                      /*IsSingleScalar=*/true);
+    else
+      VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                  TruncTy, {}, Def->getDebugLoc());
+
     if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
       // UnderlyingExt has distinct return type, used to retain legacy cost.
       VPC->setUnderlyingValue(UnderlyingExt);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 444c6dd269425..9a61fee273cb5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -29,7 +29,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: [[STEPS:vp.*]] = SCALAR-STEPS [[IV]], ir<1>, [[VF]]
 ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]]
 ; CHECK-NEXT: CLONE [[IDX:.*]] = load [[GEP_IDX]]
-; CHECK-NEXT: CLONE [[EXT_IDX:.*]] = zext [[IDX]]
+; CHECK-NEXT: EMIT-SCALAR [[EXT_IDX:.*]] = zext [[IDX]]
 ; CHECK-NEXT: CLONE [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
 ; CHECK-NEXT: CLONE [[HISTVAL:.*]] = load [[GEP_BUCKET]]
 ; CHECK-NEXT: CLONE [[UPDATE:.*]] = add nsw [[HISTVAL]], ir<1>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b23b0ce759d49..f36ec61541624 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -58,7 +58,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb ]
 ; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb ]
 ; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: EMIT ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
 ; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx>
 ; CHECK-NEXT: EMIT ir<%add9> = add ir<%1>, ir<1>
@@ -101,7 +101,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
 ; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
 ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0> to i64
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
 ; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
 ; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
@@ -235,7 +235,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ]
 ; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
 ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0> to i64
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
 ; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
 ; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
@@ -467,7 +467,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb ]
 ; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb ]
 ; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: EMIT ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
 ; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx>
 ; CHECK-NEXT: EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
@@ -510,7 +510,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
 ; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
 ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0> to i64
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
 ; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
 ; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
@@ -644,7 +644,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ]
 ; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
 ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0> to i64
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
 ; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
 ; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 05a495d51c458..59030dec6e02d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -17,8 +17,7 @@ define void @f1() {
 ; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: store <2 x ptr> , ptr [[TMP2]], align 8
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/as_cast.ll b/llvm/test/Transforms/LoopVectorize/as_cast.ll
index 67aacefebd555..9ea8d4a679ab8 100644
--- a/llvm/test/Transforms/LoopVectorize/as_cast.ll
+++ b/llvm/test/Transforms/LoopVectorize/as_cast.ll
@@ -11,18 +11,19 @@ loop:
   %arrayidx = getelementptr inbounds i64, ptr %ascast, i64 %next
   store i64 %next, ptr %arrayidx, align 4
 
-; check that we find the two interleaved blocks with ascast, gep and store:
+; check that we find the loop-invariant ascast followed by two interleaved
+; blocks with gep and store:
+; CHECK: [[AS1:%.*]] = addrspacecast ptr addrspace(1) %in to ptr
+; CHECK: vector.body:
 ; CHECK: pred.store.if:
 ; CHECK: [[ID1:%.*]] = add i64 %{{.*}}, 1
-; CHECK: [[AS1:%.*]] = addrspacecast ptr addrspace(1) %{{.*}} to ptr
 ; CHECK: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[AS1]], i64 [[ID1]]
 ; CHECK: store i64 [[ID1]], ptr [[GEP1]]
 
 ; CHECK: pred.store.if1:
 ; CHECK: [[ID2:%.*]] = add i64 %{{.*}}, 1
-; CHECK: [[AS2:%.*]] = addrspacecast ptr addrspace(1) %in to ptr
-; CHECK: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[AS2]], i64 [[ID2]]
-; CHECK: store i64 [[ID2]], ptr %9, align 4
+; CHECK: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[AS1]], i64 [[ID2]]
+; CHECK: store i64 [[ID2]], ptr [[GEP2]], align 4
 
   %cmp = icmp eq i64 %next, 7
   br i1 %cmp, label %exit, label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index f03870096ca97..92dea7c2a55ec 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -346,12 +346,10 @@ define void @pr76986_trunc_sext_interleaving_only(i16 %arg, ptr noalias %src, pt
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr %src, i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = sext i8 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = sext i8 [[TMP5]] to i32
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP6]] to i16
-; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP7]] to i16
-; CHECK-NEXT: [[TMP10:%.*]] = sdiv i16 [[TMP8]], %arg
-; CHECK-NEXT: [[TMP11:%.*]] = sdiv i16 [[TMP9]], %arg
+; CHECK-NEXT: [[TMP6:%.*]] = sext i8 [[TMP4]] to i16
+; CHECK-NEXT: [[TMP7:%.*]] = sext i8 [[TMP5]] to i16
+; CHECK-NEXT: [[TMP10:%.*]] = sdiv i16 [[TMP6]], %arg
+; CHECK-NEXT: [[TMP11:%.*]] = sdiv i16 [[TMP7]], %arg
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr %dst, i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr %dst, i64 [[TMP1]]
 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP12]], align 2
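
For illustration only (not part of the patch): the core of the new uniform-cast path in VPRecipeBuilder::handleReplication is constructing the typed VPInstruction directly instead of a VPReplicateRecipe. A minimal sketch, assuming the surrounding VPlan headers; `Cast` and `Ops` are hypothetical stand-ins for the uniform cast instruction and its VPValue operands, not names from this patch:

  // Build a single-scalar cast recipe, as the new handleReplication path does
  // for IsUniform casts (prints as EMIT-SCALAR instead of CLONE in VPlan dumps).
  auto *Recipe = new VPInstructionWithType(
      Cast->getOpcode(), Ops, Cast->getType(), VPIRFlags(*Cast),
      Cast->getDebugLoc(), /*IsSingleScalar=*/true, Cast->getName());
  // Attaching the underlying IR instruction lets the new
  // VPInstructionWithType::computeCost fall back to Ctx.getLegacyCost, so
  // costing matches the legacy model until VPlan-based costing takes over.
  Recipe->setUnderlyingValue(Cast);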