From ad591acf5eb8609692e6700cc0e5c66e49cf7035 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Thu, 3 Oct 2024 06:39:59 -0700
Subject: [PATCH 01/10] [SLP] Make getSameOpcode support different
 instructions if they have the same semantics.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 237 +++++++++++++++---
 .../SLPVectorizer/AArch64/vec3-base.ll        |   8 +-
 ...reversed-strided-node-with-external-ptr.ll |   7 +-
 .../SLPVectorizer/RISCV/vec3-base.ll          |   8 +-
 .../SLPVectorizer/X86/barriercall.ll          |   4 +-
 .../X86/bottom-to-top-reorder.ll              |  27 +-
 .../X86/extract-scalar-from-undef.ll          |  27 +-
 .../SLPVectorizer/X86/extractcost.ll          |   4 +-
 .../X86/minbitwidth-drop-wrapping-flags.ll    |   4 +-
 .../X86/multi-extracts-bv-combined.ll         |   4 +-
 .../Transforms/SLPVectorizer/X86/vec3-base.ll |  19 +-
 .../alternate-opcode-sindle-bv.ll             |  36 ++-
 .../resized-alt-shuffle-after-minbw.ll        |  62 +++--
 .../SLPVectorizer/shuffle-mask-resized.ll     |   4 +-
 14 files changed, 313 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ba70ab1e5e14b..99cb81f13a250 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -832,8 +832,107 @@ struct InstructionsState {
       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
 };
 
+struct InterchangeableInstruction {
+  unsigned Opcode;
+  SmallVector<Value *> Ops;
+  template <class... ArgTypes>
+  InterchangeableInstruction(unsigned Opcode, ArgTypes &&...Args)
+      : Opcode(Opcode), Ops{std::forward<ArgTypes>(Args)...} {}
+};
+
+bool operator<(const InterchangeableInstruction &LHS,
+               const InterchangeableInstruction &RHS) {
+  return LHS.Opcode < RHS.Opcode;
+}
+
 } // end anonymous namespace
 
+/// \returns a list of interchangeable instructions, sorted by opcode, that
+/// \p I can be converted to.
+/// e.g.,
+/// x << y -> x * (2^y)
+/// x << 1 -> x * 2
+/// x << 0 -> x * 1 -> x - 0 -> x + 0 -> x & 11...1 -> x | 0
+/// x * 0 -> x & 0
+/// x * -1 -> 0 - x
+/// TODO: support more patterns
+static SmallVector<InterchangeableInstruction>
+getInterchangeableInstruction(Instruction *I) {
+  // PII = Possible Interchangeable Instruction
+  SmallVector<InterchangeableInstruction> PII;
+  unsigned Opcode = I->getOpcode();
+  PII.emplace_back(Opcode, I->operands());
+  if (!is_contained({Instruction::Shl, Instruction::Mul, Instruction::Sub,
+                     Instruction::Add},
+                    Opcode))
+    return PII;
+  Constant *C;
+  if (match(I, m_BinOp(m_Value(), m_Constant(C)))) {
+    ConstantInt *V = nullptr;
+    if (auto *CI = dyn_cast<ConstantInt>(C)) {
+      V = CI;
+    } else if (auto *CDV = dyn_cast<ConstantDataVector>(C)) {
+      if (auto *CI = dyn_cast_if_present<ConstantInt>(CDV->getSplatValue()))
+        V = CI;
+    }
+    if (!V)
+      return PII;
+    Value *Op0 = I->getOperand(0);
+    Type *Op1Ty = I->getOperand(1)->getType();
+    const APInt &Op1Int = V->getValue();
+    Constant *Zero =
+        ConstantInt::get(Op1Ty, APInt::getZero(Op1Int.getBitWidth()));
+    Constant *UnsignedMax =
+        ConstantInt::get(Op1Ty, APInt::getMaxValue(Op1Int.getBitWidth()));
+    switch (Opcode) {
+    case Instruction::Shl: {
+      PII.emplace_back(Instruction::Mul, Op0,
+                       ConstantInt::get(Op1Ty, 1 << Op1Int.getZExtValue()));
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+    case Instruction::Mul: {
+      switch (Op1Int.getSExtValue()) {
+      case 1:
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+        break;
+      case 0:
+        PII.emplace_back(Instruction::And, Op0, Zero);
+        break;
+      case -1:
+        PII.emplace_back(Instruction::Sub, Zero, Op0);
+        break;
+      }
+      break;
+    }
+    case Instruction::Sub:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    case Instruction::Add:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+  }
+  // std::set_intersection requires a sorted range.
+  sort(PII);
+  return PII;
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -938,18 +1037,54 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
       return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   }
+  // Currently, this is only used for binary ops.
+ // TODO: support all instructions + SmallVector InterchangeableOpcode = + getInterchangeableInstruction(cast(VL[BaseIndex])); + SmallVector AlternateInterchangeableOpcode; + auto UpdateInterchangeableOpcode = + [](SmallVector &LHS, + ArrayRef RHS) { + SmallVector NewInterchangeableOpcode; + std::set_intersection(LHS.begin(), LHS.end(), RHS.begin(), RHS.end(), + std::back_inserter(NewInterchangeableOpcode)); + if (NewInterchangeableOpcode.empty()) + return false; + LHS = std::move(NewInterchangeableOpcode); + return true; + }; for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { auto *I = cast(VL[Cnt]); unsigned InstOpcode = I->getOpcode(); if (IsBinOp && isa(I)) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + SmallVector ThisInterchangeableOpcode( + getInterchangeableInstruction(I)); + if (UpdateInterchangeableOpcode(InterchangeableOpcode, + ThisInterchangeableOpcode)) continue; - if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && - isValidForAlternation(Opcode)) { - AltOpcode = InstOpcode; - AltIndex = Cnt; + if (AlternateInterchangeableOpcode.empty()) { + InterchangeableOpcode.erase( + std::remove_if(InterchangeableOpcode.begin(), + InterchangeableOpcode.end(), + [](const InterchangeableInstruction &I) { + return !isValidForAlternation(I.Opcode); + }), + InterchangeableOpcode.end()); + ThisInterchangeableOpcode.erase( + std::remove_if(ThisInterchangeableOpcode.begin(), + ThisInterchangeableOpcode.end(), + [](const InterchangeableInstruction &I) { + return !isValidForAlternation(I.Opcode); + }), + ThisInterchangeableOpcode.end()); + if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty()) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + AlternateInterchangeableOpcode = std::move(ThisInterchangeableOpcode); continue; } + if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode, + ThisInterchangeableOpcode)) + continue; } else if (IsCastOp && isa(I)) { Value *Op0 = IBase->getOperand(0); Type *Ty0 = Op0->getType(); @@ -1043,6 +1178,21 @@ static InstructionsState getSameOpcode(ArrayRef VL, return InstructionsState(VL[BaseIndex], nullptr, nullptr); } + if (IsBinOp) { + auto FindOp = [&](ArrayRef CandidateOp) { + for (Value *V : VL) + for (const InterchangeableInstruction &I : CandidateOp) + if (cast(V)->getOpcode() == I.Opcode) + return cast(V); + llvm_unreachable( + "Cannot find the candidate instruction for InstructionsState."); + }; + Instruction *MainOp = FindOp(InterchangeableOpcode); + Instruction *AltOp = AlternateInterchangeableOpcode.empty() + ? MainOp + : FindOp(AlternateInterchangeableOpcode); + return InstructionsState(VL[BaseIndex], MainOp, AltOp); + } return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), cast(VL[AltIndex])); } @@ -2335,24 +2485,41 @@ class BoUpSLP { : cast(VL[0])->getNumOperands(); OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + InstructionsState S = getSameOpcode(VL, TLI); + for (unsigned OpIdx : seq(NumOperands)) OpsVec[OpIdx].resize(NumLanes); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - assert(isa(VL[Lane]) && "Expected instruction"); - // Our tree has just 3 nodes: the root and two operands. - // It is therefore trivial to get the APO. We only need to check the - // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or - // RHS operand. The LHS operand of both add and sub is never attached - // to an inversese operation in the linearized form, therefore its APO - // is false. 
The RHS is true only if VL[Lane] is an inverse operation. - - // Since operand reordering is performed on groups of commutative - // operations or alternating sequences (e.g., +, -), we can safely - // tell the inverse operations by checking commutativity. - bool IsInverseOperation = !isCommutative(cast(VL[Lane])); + for (auto [I, V] : enumerate(VL)) { + assert(isa(V) && "Expected instruction"); + SmallVector IIList = + getInterchangeableInstruction(cast(V)); + Value *SelectedOp; + auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { + return II.Opcode == S.MainOp->getOpcode(); + }); + if (Iter == IIList.end()) { + Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { + return II.Opcode == S.AltOp->getOpcode(); + }); + SelectedOp = S.AltOp; + } else { + SelectedOp = S.MainOp; + } + assert(Iter != IIList.end() && + "Cannot find an interchangeable instruction."); + // Our tree has just 3 nodes: the root and two operands. + // It is therefore trivial to get the APO. We only need to check the + // opcode of V and whether the operand at OpIdx is the LHS or RHS + // operand. The LHS operand of both add and sub is never attached to an + // inversese operation in the linearized form, therefore its APO is + // false. The RHS is true only if V is an inverse operation. + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + bool IsInverseOperation = !isCommutative(cast(SelectedOp)); + for (unsigned OpIdx : seq(NumOperands)) { bool APO = (OpIdx == 0) ? false : IsInverseOperation; - OpsVec[OpIdx][Lane] = {cast(VL[Lane])->getOperand(OpIdx), - APO, false}; + OpsVec[OpIdx][I] = {Iter->Ops[OpIdx], APO, false}; } } } @@ -3252,15 +3419,25 @@ class BoUpSLP { auto *I0 = cast(Scalars[0]); Operands.resize(I0->getNumOperands()); unsigned NumLanes = Scalars.size(); - for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); - OpIdx != NumOperands; ++OpIdx) { + unsigned NumOperands = I0->getNumOperands(); + for (unsigned OpIdx : seq(NumOperands)) Operands[OpIdx].resize(NumLanes); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - auto *I = cast(Scalars[Lane]); - assert(I->getNumOperands() == NumOperands && - "Expected same number of operands"); - Operands[OpIdx][Lane] = I->getOperand(OpIdx); - } + for (auto [I, V] : enumerate(Scalars)) { + SmallVector IIList = + getInterchangeableInstruction(cast(V)); + auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { + return II.Opcode == MainOp->getOpcode(); + }); + if (Iter == IIList.end()) + Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { + return II.Opcode == AltOp->getOpcode(); + }); + assert(Iter != IIList.end() && + "Cannot find an interchangeable instruction."); + assert(Iter->Ops.size() == NumOperands && + "Expected same number of operands"); + for (auto [J, Op] : enumerate(Iter->Ops)) + Operands[J][I] = Op; } } @@ -14935,7 +15112,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *V = Builder.CreateBinOp( static_cast(E->getOpcode()), LHS, RHS); - propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end()); + propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end()); if (auto *I = dyn_cast(V)) { V = propagateMetadata(I, E->Scalars); // Drop nuw flags for abs(sub(commutative), true). 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll index c18811a35c1ee..c7c999bb57285 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll @@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) { ; ; POW2-ONLY-LABEL: @store_try_reorder( ; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0 -; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 -; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2 +; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 ; POW2-ONLY-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll index 9c1da08c64b7b..7bc03e7c7755b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -7,13 +7,12 @@ define void @test(ptr %a, i64 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: -; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] -; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> , <2 x double> poison) ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 308d0e27f1ea8..e158c2a3ed87e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) { ; ; POW2-ONLY-LABEL: @store_try_reorder( ; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0 -; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 -; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2 +; POW2-ONLY-NEXT: store i32 
[[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 ; POW2-ONLY-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll index d388fd17925a1..59a6e5f4d0c6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -10,9 +10,7 @@ define i32 @foo(ptr nocapture %A, i32 %n) { ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar() ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll index 889f5a95c81d6..7af0c64f18748 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll @@ -4,22 +4,17 @@ define void @test(ptr %0, ptr %1, ptr %2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> , [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> , [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> , [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP2:%.*]], align 4 ; CHECK-NEXT: ret void ; %4 = load i32, ptr %1, align 4 diff 
--git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index 6ff03acf85cdf..06f4b6e4521de 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,19 +4,20 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[TMP7:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> , <2 x i32> , i64 4) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 undef, i32 6 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP12]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll index 1374e9873e1c5..ac4603c9c88de 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -9,9 +9,7 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP4:%.*]] = add 
nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll index 2a5bfa7390770..daab4b6ea4c95 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll @@ -8,10 +8,8 @@ define i32 @test() { ; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[A_PROMOTED]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll index e6a166c27ac49..94f2c79faa8c9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll @@ -9,9 +9,7 @@ define i32 @foo() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[D]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4 +; CHECK-NEXT: store <8 x i32> [[TMP2]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll index 96d4b84e03691..83391a96c5e34 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll @@ -242,13 +242,18 @@ exit: } define void @store_try_reorder(ptr %dst) { -; CHECK-LABEL: @store_try_reorder( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD:%.*]] = add i32 0, 0 -; CHECK-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @store_try_reorder( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @store_try_reorder( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2 +; POW2-ONLY-NEXT: 
store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %add = add i32 0, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll index c250029519590..e4eff0f72b356 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll @@ -1,18 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %} define <2 x i32> @test(i32 %arg) { -; CHECK-LABEL: define <2 x i32> @test( -; CHECK-SAME: i32 [[ARG:%.*]]) { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 0 -; CHECK-NEXT: [[MUL:%.*]] = mul i32 0, 1 -; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1 -; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; X86-LABEL: define <2 x i32> @test( +; X86-SAME: i32 [[ARG:%.*]]) { +; X86-NEXT: bb: +; X86-NEXT: [[OR:%.*]] = or i32 [[ARG]], 0 +; X86-NEXT: [[MUL:%.*]] = mul i32 0, 1 +; X86-NEXT: [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]] +; X86-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; X86-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0 +; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1 +; X86-NEXT: ret <2 x i32> [[TMP1]] +; +; AARCH64-LABEL: define <2 x i32> @test( +; AARCH64-SAME: i32 [[ARG:%.*]]) { +; AARCH64-NEXT: bb: +; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 +; AARCH64-NEXT: [[TMP1:%.*]] = or <2 x i32> [[TMP0]], zeroinitializer +; AARCH64-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; AARCH64-NEXT: [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]] +; AARCH64-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; AARCH64-NEXT: ret <2 x i32> [[TMP1]] ; bb: %or = or i32 %arg, 0 @@ -23,4 +34,3 @@ bb: %1 = insertelement <2 x i32> %0, i32 %mul, i32 1 ret <2 x i32> %1 } - diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll index 56281424c7114..bcca8ba53016d 100644 --- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -5,15 +5,13 @@ define void @func(i32 %0) { ; CHECK-LABEL: define void @func( ; CHECK-SAME: i32 [[TMP0:%.*]]) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: 
[[TMP6:%.*]] = shl i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> , <32 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) @@ -24,61 +22,61 @@ define void @func(i32 %0) { ; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64> ; CHECK-NEXT: [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP20]], i32 31 -; CHECK-NEXT: [[TMP22:%.*]] = and i1 false, [[TMP21]] +; CHECK-NEXT: [[TMP76:%.*]] = and i1 false, [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i1> [[TMP20]], i32 30 -; CHECK-NEXT: [[TMP24:%.*]] = and i1 false, [[TMP23]] +; CHECK-NEXT: [[TMP22:%.*]] = and i1 false, [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i1> [[TMP20]], i32 29 -; CHECK-NEXT: [[TMP26:%.*]] = and i1 false, [[TMP25]] +; CHECK-NEXT: [[TMP24:%.*]] = and i1 false, [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP20]], i32 28 -; CHECK-NEXT: [[TMP28:%.*]] = and i1 false, [[TMP27]] +; CHECK-NEXT: [[TMP26:%.*]] = and i1 false, [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <32 x i1> [[TMP20]], i32 27 -; CHECK-NEXT: [[TMP30:%.*]] = and i1 false, [[TMP29]] +; CHECK-NEXT: [[TMP28:%.*]] = and i1 false, [[TMP29]] ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i1> [[TMP20]], i32 26 -; CHECK-NEXT: [[TMP32:%.*]] = and i1 false, [[TMP31]] +; CHECK-NEXT: [[TMP30:%.*]] = and i1 false, [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i1> [[TMP20]], i32 25 -; CHECK-NEXT: [[TMP34:%.*]] = and i1 false, [[TMP33]] +; CHECK-NEXT: [[TMP32:%.*]] = and i1 false, [[TMP33]] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i1> [[TMP20]], i32 24 -; CHECK-NEXT: [[TMP36:%.*]] = and i1 false, [[TMP35]] +; CHECK-NEXT: [[TMP34:%.*]] = and i1 false, [[TMP35]] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP20]], i32 23 -; CHECK-NEXT: [[TMP38:%.*]] = and i1 false, [[TMP37]] +; CHECK-NEXT: [[TMP36:%.*]] = and i1 false, [[TMP37]] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <32 x i1> [[TMP20]], i32 22 -; CHECK-NEXT: [[TMP40:%.*]] = and i1 false, [[TMP39]] +; CHECK-NEXT: [[TMP38:%.*]] = and i1 false, [[TMP39]] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <32 x i1> [[TMP20]], i32 21 -; CHECK-NEXT: [[TMP42:%.*]] = and i1 false, [[TMP41]] +; CHECK-NEXT: [[TMP40:%.*]] = and i1 false, [[TMP41]] ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i1> [[TMP20]], i32 20 -; CHECK-NEXT: [[TMP44:%.*]] = and i1 false, [[TMP43]] +; CHECK-NEXT: [[TMP42:%.*]] = and i1 false, [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <32 x i1> [[TMP20]], i32 19 -; CHECK-NEXT: [[TMP46:%.*]] = and i1 false, [[TMP45]] +; CHECK-NEXT: [[TMP44:%.*]] = and i1 false, [[TMP45]] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP20]], i32 18 -; CHECK-NEXT: [[TMP48:%.*]] = and i1 
false, [[TMP47]] +; CHECK-NEXT: [[TMP46:%.*]] = and i1 false, [[TMP47]] ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <32 x i1> [[TMP20]], i32 17 -; CHECK-NEXT: [[TMP50:%.*]] = and i1 false, [[TMP49]] +; CHECK-NEXT: [[TMP48:%.*]] = and i1 false, [[TMP49]] ; CHECK-NEXT: [[TMP51:%.*]] = extractelement <32 x i1> [[TMP20]], i32 16 -; CHECK-NEXT: [[TMP52:%.*]] = and i1 false, [[TMP51]] +; CHECK-NEXT: [[TMP50:%.*]] = and i1 false, [[TMP51]] ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i1> [[TMP20]], i32 15 -; CHECK-NEXT: [[TMP54:%.*]] = and i1 false, [[TMP53]] +; CHECK-NEXT: [[TMP52:%.*]] = and i1 false, [[TMP53]] ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <32 x i1> [[TMP20]], i32 14 -; CHECK-NEXT: [[TMP56:%.*]] = and i1 false, [[TMP55]] +; CHECK-NEXT: [[TMP54:%.*]] = and i1 false, [[TMP55]] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP20]], i32 13 -; CHECK-NEXT: [[TMP58:%.*]] = and i1 false, [[TMP57]] +; CHECK-NEXT: [[TMP56:%.*]] = and i1 false, [[TMP57]] ; CHECK-NEXT: [[TMP59:%.*]] = extractelement <32 x i1> [[TMP20]], i32 12 -; CHECK-NEXT: [[TMP60:%.*]] = and i1 false, [[TMP59]] +; CHECK-NEXT: [[TMP58:%.*]] = and i1 false, [[TMP59]] ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i1> [[TMP20]], i32 11 -; CHECK-NEXT: [[TMP62:%.*]] = and i1 false, [[TMP61]] +; CHECK-NEXT: [[TMP60:%.*]] = and i1 false, [[TMP61]] ; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i1> [[TMP20]], i32 10 -; CHECK-NEXT: [[TMP64:%.*]] = and i1 false, [[TMP63]] +; CHECK-NEXT: [[TMP62:%.*]] = and i1 false, [[TMP63]] ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <32 x i1> [[TMP20]], i32 9 -; CHECK-NEXT: [[TMP66:%.*]] = and i1 false, [[TMP65]] +; CHECK-NEXT: [[TMP64:%.*]] = and i1 false, [[TMP65]] ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP20]], i32 8 -; CHECK-NEXT: [[TMP68:%.*]] = and i1 false, [[TMP67]] +; CHECK-NEXT: [[TMP66:%.*]] = and i1 false, [[TMP67]] ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <32 x i1> [[TMP20]], i32 7 -; CHECK-NEXT: [[TMP70:%.*]] = and i1 false, [[TMP69]] +; CHECK-NEXT: [[TMP68:%.*]] = and i1 false, [[TMP69]] ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <32 x i1> [[TMP20]], i32 6 -; CHECK-NEXT: [[TMP72:%.*]] = and i1 false, [[TMP71]] +; CHECK-NEXT: [[TMP70:%.*]] = and i1 false, [[TMP71]] ; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i1> [[TMP20]], i32 5 -; CHECK-NEXT: [[TMP74:%.*]] = and i1 false, [[TMP73]] +; CHECK-NEXT: [[TMP72:%.*]] = and i1 false, [[TMP73]] ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <32 x i1> [[TMP20]], i32 4 -; CHECK-NEXT: [[TMP76:%.*]] = and i1 false, [[TMP75]] +; CHECK-NEXT: [[TMP74:%.*]] = and i1 false, [[TMP75]] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i32> [[TMP18]], i32 0 ; CHECK-NEXT: [[TMP78:%.*]] = sext i32 [[TMP77]] to i64 ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr float, ptr addrspace(1) null, i64 [[TMP78]] diff --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll index 732b50396a460..1e3255f2187af 100644 --- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll +++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll @@ -12,9 +12,7 @@ define i32 @test() { ; CHECK-NEXT: br i1 false, label [[BB4:%.*]], label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP5]] = shufflevector <2 x i32> [[TMP3]], <2 x 
i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP5]] = or <2 x i32> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ] From f3935909ec19ac27e432da8eb8bcbad72fe5d752 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Mon, 21 Oct 2024 00:35:40 -0700 Subject: [PATCH 02/10] getSExtValue may use too many bits --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 99cb81f13a250..d743a33057d15 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -897,19 +897,15 @@ getInterchangeableInstruction(Instruction *I) { break; } case Instruction::Mul: { - switch (Op1Int.getSExtValue()) { - case 1: + if (Op1Int.isOne()) { PII.emplace_back(Instruction::Sub, Op0, Zero); PII.emplace_back(Instruction::Add, Op0, Zero); PII.emplace_back(Instruction::And, Op0, UnsignedMax); PII.emplace_back(Instruction::Or, Op0, Zero); - break; - case 0: + } else if (Op1Int.isZero()) { PII.emplace_back(Instruction::And, Op0, Zero); - break; - case -1: + } else if (Op1Int.isAllOnes()) { PII.emplace_back(Instruction::Sub, Zero, Op0); - break; } break; } From 83ed351211e4fa9b02b32b736793c8f212af26f5 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Mon, 21 Oct 2024 03:41:04 -0700 Subject: [PATCH 03/10] apply comment --- .../Transforms/Vectorize/SLPVectorizer.cpp | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d743a33057d15..a55d9a3dea619 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1046,7 +1046,7 @@ static InstructionsState getSameOpcode(ArrayRef VL, std::back_inserter(NewInterchangeableOpcode)); if (NewInterchangeableOpcode.empty()) return false; - LHS = std::move(NewInterchangeableOpcode); + LHS.swap(NewInterchangeableOpcode); return true; }; for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { @@ -1060,22 +1060,20 @@ static InstructionsState getSameOpcode(ArrayRef VL, continue; if (AlternateInterchangeableOpcode.empty()) { InterchangeableOpcode.erase( - std::remove_if(InterchangeableOpcode.begin(), - InterchangeableOpcode.end(), - [](const InterchangeableInstruction &I) { - return !isValidForAlternation(I.Opcode); - }), + remove_if(InterchangeableOpcode, + [](const InterchangeableInstruction &I) { + return !isValidForAlternation(I.Opcode); + }), InterchangeableOpcode.end()); ThisInterchangeableOpcode.erase( - std::remove_if(ThisInterchangeableOpcode.begin(), - ThisInterchangeableOpcode.end(), - [](const InterchangeableInstruction &I) { - return !isValidForAlternation(I.Opcode); - }), + remove_if(ThisInterchangeableOpcode, + [](const InterchangeableInstruction &I) { + return !isValidForAlternation(I.Opcode); + }), ThisInterchangeableOpcode.end()); if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty()) return InstructionsState(VL[BaseIndex], nullptr, nullptr); - AlternateInterchangeableOpcode = std::move(ThisInterchangeableOpcode); + AlternateInterchangeableOpcode.swap(ThisInterchangeableOpcode); continue; } if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode, From 9672f6d3e36c5135f7387e58ee21aa82a6f384b6 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Tue, 22 Oct 2024 
23:40:04 -0700 Subject: [PATCH 04/10] reduce repeated code --- .../Transforms/Vectorize/SLPVectorizer.cpp | 61 +++++++++---------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a55d9a3dea619..b82b15a81f997 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -929,6 +929,29 @@ getInterchangeableInstruction(Instruction *I) { return PII; } +/// \returns the Op and operands which \p I convert to. +static std::pair> +getInterchangeableInstruction(Instruction *I, Instruction *MainOp, + Instruction *AltOp) { + SmallVector IIList = + getInterchangeableInstruction(I); + auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { + return II.Opcode == MainOp->getOpcode(); + }); + Value *SelectedOp; + if (Iter == IIList.end()) { + Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { + return II.Opcode == AltOp->getOpcode(); + }); + assert(Iter != IIList.end() && + "Cannot find an interchangeable instruction."); + SelectedOp = AltOp; + } else { + SelectedOp = MainOp; + } + return std::make_pair(SelectedOp, Iter->Ops); +} + /// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. /// @@ -2484,22 +2507,8 @@ class BoUpSLP { OpsVec[OpIdx].resize(NumLanes); for (auto [I, V] : enumerate(VL)) { assert(isa(V) && "Expected instruction"); - SmallVector IIList = - getInterchangeableInstruction(cast(V)); - Value *SelectedOp; - auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { - return II.Opcode == S.MainOp->getOpcode(); - }); - if (Iter == IIList.end()) { - Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { - return II.Opcode == S.AltOp->getOpcode(); - }); - SelectedOp = S.AltOp; - } else { - SelectedOp = S.MainOp; - } - assert(Iter != IIList.end() && - "Cannot find an interchangeable instruction."); + auto [SelectedOp, Ops] = getInterchangeableInstruction( + cast(V), S.MainOp, S.AltOp); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of V and whether the operand at OpIdx is the LHS or RHS @@ -2513,7 +2522,7 @@ class BoUpSLP { bool IsInverseOperation = !isCommutative(cast(SelectedOp)); for (unsigned OpIdx : seq(NumOperands)) { bool APO = (OpIdx == 0) ? 
false : IsInverseOperation; - OpsVec[OpIdx][I] = {Iter->Ops[OpIdx], APO, false}; + OpsVec[OpIdx][I] = {Ops[OpIdx], APO, false}; } } } @@ -3417,20 +3426,10 @@ class BoUpSLP { for (unsigned OpIdx : seq(NumOperands)) Operands[OpIdx].resize(NumLanes); for (auto [I, V] : enumerate(Scalars)) { - SmallVector IIList = - getInterchangeableInstruction(cast(V)); - auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { - return II.Opcode == MainOp->getOpcode(); - }); - if (Iter == IIList.end()) - Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { - return II.Opcode == AltOp->getOpcode(); - }); - assert(Iter != IIList.end() && - "Cannot find an interchangeable instruction."); - assert(Iter->Ops.size() == NumOperands && - "Expected same number of operands"); - for (auto [J, Op] : enumerate(Iter->Ops)) + auto [SelectedOp, Ops] = + getInterchangeableInstruction(cast(V), MainOp, AltOp); + assert(Ops.size() == NumOperands && "Expected same number of operands"); + for (auto [J, Op] : enumerate(Ops)) Operands[J][I] = Op; } } From 10977812eef36e31162664d76ff0bbeb113539a2 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Fri, 6 Dec 2024 01:35:35 -0800 Subject: [PATCH 05/10] fix conflict --- .../Transforms/Vectorize/SLPVectorizer.cpp | 73 ++++--------------- ...reversed-strided-node-with-external-ptr.ll | 17 ++--- .../SLPVectorizer/X86/barriercall.ll | 11 +-- .../X86/extract-scalar-from-undef.ll | 43 +++-------- .../SLPVectorizer/X86/extractcost.ll | 15 +--- 5 files changed, 40 insertions(+), 119 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bfa4ac5a22a22..fe660b1f81714 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1072,11 +1072,11 @@ static InstructionsState getSameOpcode(ArrayRef VL, if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty()) return InstructionsState::invalid(); } -<<<<<<< HEAD + bool AnyPoison = InstCnt != VL.size(); // Currently, this is only used for binary ops. // TODO: support all instructions SmallVector InterchangeableOpcode = - getInterchangeableInstruction(cast(VL[BaseIndex])); + getInterchangeableInstruction(cast(V)); SmallVector AlternateInterchangeableOpcode; auto UpdateInterchangeableOpcode = [](SmallVector &LHS, @@ -1089,9 +1089,6 @@ static InstructionsState getSameOpcode(ArrayRef VL, LHS.swap(NewInterchangeableOpcode); return true; }; -======= - bool AnyPoison = InstCnt != VL.size(); ->>>>>>> upstream/main for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { auto *I = dyn_cast(VL[Cnt]); if (!I) @@ -1123,7 +1120,7 @@ static InstructionsState getSameOpcode(ArrayRef VL, }), ThisInterchangeableOpcode.end()); if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty()) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); AlternateInterchangeableOpcode.swap(ThisInterchangeableOpcode); continue; } @@ -1230,7 +1227,6 @@ static InstructionsState getSameOpcode(ArrayRef VL, return InstructionsState::invalid(); } -<<<<<<< HEAD if (IsBinOp) { auto FindOp = [&](ArrayRef CandidateOp) { for (Value *V : VL) @@ -1244,12 +1240,9 @@ static InstructionsState getSameOpcode(ArrayRef VL, Instruction *AltOp = AlternateInterchangeableOpcode.empty() ? 
MainOp : FindOp(AlternateInterchangeableOpcode); - return InstructionsState(VL[BaseIndex], MainOp, AltOp); + return InstructionsState(MainOp, AltOp); } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), -======= return InstructionsState(cast(V), ->>>>>>> upstream/main cast(VL[AltIndex])); } @@ -2593,11 +2586,18 @@ class BoUpSLP { InstructionsState S = getSameOpcode(VL, TLI); for (unsigned OpIdx : seq(NumOperands)) OpsVec[OpIdx].resize(NumLanes); -<<<<<<< HEAD - for (auto [I, V] : enumerate(VL)) { - assert(isa(V) && "Expected instruction"); + for (auto [Lane, V] : enumerate(VL)) { + assert((isa(V) || isa(V)) && + "Expected instruction or poison value"); + if (isa(V)) { + for (unsigned OpIdx : seq(NumOperands)) + OpsVec[OpIdx][Lane] = { + PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, + false}; + continue; + } auto [SelectedOp, Ops] = getInterchangeableInstruction( - cast(V), S.MainOp, S.AltOp); + cast(V), S.getMainOp(), S.getAltOp()); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of V and whether the operand at OpIdx is the LHS or RHS @@ -2610,30 +2610,8 @@ class BoUpSLP { // tell the inverse operations by checking commutativity. bool IsInverseOperation = !isCommutative(cast(SelectedOp)); for (unsigned OpIdx : seq(NumOperands)) { -======= - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - assert((isa(VL[Lane]) || isa(VL[Lane])) && - "Expected instruction or poison value"); - // Our tree has just 3 nodes: the root and two operands. - // It is therefore trivial to get the APO. We only need to check the - // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or - // RHS operand. The LHS operand of both add and sub is never attached - // to an inversese operation in the linearized form, therefore its APO - // is false. The RHS is true only if VL[Lane] is an inverse operation. - - // Since operand reordering is performed on groups of commutative - // operations or alternating sequences (e.g., +, -), we can safely - // tell the inverse operations by checking commutativity. - if (isa(VL[Lane])) { - OpsVec[OpIdx][Lane] = { - PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, - false}; - continue; - } - bool IsInverseOperation = !isCommutative(cast(VL[Lane])); ->>>>>>> upstream/main bool APO = (OpIdx == 0) ? false : IsInverseOperation; - OpsVec[OpIdx][I] = {Ops[OpIdx], APO, false}; + OpsVec[OpIdx][Lane] = {Ops[OpIdx], APO, false}; } } } @@ -3534,24 +3512,6 @@ class BoUpSLP { copy(OpVL, Operands[OpIdx].begin()); } -<<<<<<< HEAD - /// Set the operands of this bundle in their original order. - void setOperandsInOrder() { - assert(Operands.empty() && "Already initialized?"); - auto *I0 = cast(Scalars[0]); - Operands.resize(I0->getNumOperands()); - unsigned NumLanes = Scalars.size(); - unsigned NumOperands = I0->getNumOperands(); - for (unsigned OpIdx : seq(NumOperands)) - Operands[OpIdx].resize(NumLanes); - for (auto [I, V] : enumerate(Scalars)) { - auto [SelectedOp, Ops] = - getInterchangeableInstruction(cast(V), MainOp, AltOp); - assert(Ops.size() == NumOperands && "Expected same number of operands"); - for (auto [J, Op] : enumerate(Ops)) - Operands[J][I] = Op; - } -======= /// Set this bundle's operand from Scalars. 
void setOperand(const BoUpSLP &R, bool RequireReorder = false) { VLOperands Ops(Scalars, MainOp, R); @@ -3559,7 +3519,6 @@ class BoUpSLP { Ops.reorder(); for (unsigned I : seq(MainOp->getNumOperands())) setOperand(I, Ops.getVL(I)); ->>>>>>> upstream/main } /// Reorders operands of the node to the given mask \p Mask. diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll index 8b768ec4ca83a..74d7f1c91f3bf 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -12,18 +12,13 @@ define void @test(ptr %a, i64 %0) { ; CHECK: [[BB]]: ; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] -<<<<<<< HEAD -; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> , <2 x double> poison) -======= -; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison) ->>>>>>> upstream/main -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fsub <2 x double> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison) +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[A]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP7]], [[TMP10]] -; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP11]], ptr align 8 [[ARRAYIDX17_I28_1]], i64 -8, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP12:%.*]] = fsub <2 x double> [[TMP9]], [[TMP11]] +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP12]], ptr align 8 [[TMP8]], i64 -8, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: br label %[[BB]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll index fd52e01aa37be..e3b4898e85212 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -10,15 +10,8 @@ define i32 @foo(ptr nocapture %A, i32 %n) { ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@bar() ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -<<<<<<< HEAD -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], -======= -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9) ->>>>>>> upstream/main +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 9) ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index 151185c6899e9..119882f6ced1f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,38 +4,19 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -<<<<<<< HEAD -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[TMP7:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> , <2 x i32> , i64 4) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 undef, i32 6 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> zeroinitializer, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP12]], 0 -======= -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP5:%.*]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 6 -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> 
zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 ->>>>>>> upstream/main +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> , i32 undef, i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll index a22259369ce4f..fc62b0b38fd53 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -9,17 +9,10 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -<<<<<<< HEAD -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], -======= -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9) ->>>>>>> upstream/main -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 9) +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP6]], [[M:%.*]] ; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP6]], [[M]] ; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]] From 999f45a44b48ea788b4d85e35556366fa71e4fe9 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Fri, 6 Dec 2024 01:43:54 -0800 Subject: [PATCH 06/10] fix VL may contain PoisonValue --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fe660b1f81714..c46dd65f22968 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1229,10 +1229,13 @@ static InstructionsState getSameOpcode(ArrayRef VL, if (IsBinOp) { auto FindOp = [&](ArrayRef CandidateOp) { - for (Value *V : VL) + for (Value *V : VL) { + if (!isa(V)) + continue; for (const 
           if (cast<Instruction>(V)->getOpcode() == I.Opcode)
             return cast<Instruction>(V);
+      }
       llvm_unreachable(
           "Cannot find the candidate instruction for InstructionsState.");
     };

From ee74f11629e10ee5b2c9f212b3df62e2f6b70add Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Wed, 11 Dec 2024 01:06:32 -0800
Subject: [PATCH 07/10] appendOperandsOfVL does not have to call getSameOpcode. Instead, we pass MainOp and AltOp.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c46dd65f22968..e9e0bf1641de9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2575,18 +2575,18 @@ class BoUpSLP {
     }
     /// Go through the instructions in VL and append their operands.
-    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
+    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *MainOp,
+                            Instruction *AltOp) {
       assert(!VL.empty() && "Bad VL");
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       // IntrinsicInst::isCommutative returns true if swapping the first "two"
       // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = VL0->getNumOperands();
-      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
+      unsigned NumOperands = MainOp->getNumOperands();
+      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
-      InstructionsState S = getSameOpcode(VL, TLI);
       for (unsigned OpIdx : seq(NumOperands))
         OpsVec[OpIdx].resize(NumLanes);
       for (auto [Lane, V] : enumerate(VL)) {
@@ -2595,12 +2595,12 @@ class BoUpSLP {
         if (isa<PoisonValue>(V)) {
           for (unsigned OpIdx : seq(NumOperands))
             OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                 false};
           continue;
         }
-        auto [SelectedOp, Ops] = getInterchangeableInstruction(
-            cast<Instruction>(V), S.getMainOp(), S.getAltOp());
+        auto [SelectedOp, Ops] =
+            getInterchangeableInstruction(cast<Instruction>(V), MainOp, AltOp);
         // Our tree has just 3 nodes: the root and two operands.
         // It is therefore trivial to get the APO. We only need to check the
         // opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2721,11 +2721,12 @@ class BoUpSLP {
   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
+    VLOperands(ArrayRef<Value *> RootVL, Instruction *MainOp,
+               Instruction *AltOp, const BoUpSLP &R)
         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
-          L(R.LI->getLoopFor((VL0->getParent()))) {
+          L(R.LI->getLoopFor(MainOp->getParent())) {
       // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, VL0);
+      appendOperandsOfVL(RootVL, MainOp, AltOp);
     }
     /// \Returns a value vector with the operands across all lanes for the
@@ -3517,7 +3518,7 @@ class BoUpSLP {
     /// Set this bundle's operand from Scalars.
     void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
-      VLOperands Ops(Scalars, MainOp, R);
+      VLOperands Ops(Scalars, MainOp, AltOp, R);
       if (RequireReorder)
         Ops.reorder();
       for (unsigned I : seq(MainOp->getNumOperands()))
@@ -8733,7 +8734,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
       ValueList Left, Right;
-      VLOperands Ops(VL, VL0, *this);
+      VLOperands Ops(VL, VL0, S.getAltOp(), *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.

From ddf7ab48ccdb29103fa1bbbc139121ddeb5f4394 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Wed, 11 Dec 2024 01:08:46 -0800
Subject: [PATCH 08/10] refactor getInterchangeableInstruction

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e9e0bf1641de9..a20bf01446d3c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -946,18 +946,15 @@ getInterchangeableInstruction(Instruction *I, Instruction *MainOp,
   auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
     return II.Opcode == MainOp->getOpcode();
   });
-  Value *SelectedOp;
   if (Iter == IIList.end()) {
     Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
       return II.Opcode == AltOp->getOpcode();
     });
     assert(Iter != IIList.end() &&
            "Cannot find an interchangeable instruction.");
-    SelectedOp = AltOp;
-  } else {
-    SelectedOp = MainOp;
+    return std::make_pair(AltOp, Iter->Ops);
   }
-  return std::make_pair(SelectedOp, Iter->Ops);
+  return std::make_pair(MainOp, Iter->Ops);
 }
 /// \returns true if \p Opcode is allowed as part of the main/alternate

From 7c40025b78c75bfd00ad9d7edc489949c878b002 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Wed, 11 Dec 2024 21:08:18 -0800
Subject: [PATCH 09/10] apply comment

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a20bf01446d3c..5786c89a564e3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -943,7 +943,7 @@ getInterchangeableInstruction(Instruction *I, Instruction *MainOp,
                               Instruction *AltOp) {
   SmallVector<InterchangeableInstruction> IIList = getInterchangeableInstruction(I);
-  auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+  const auto *Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
     return II.Opcode == MainOp->getOpcode();
   });
   if (Iter == IIList.end()) {
@@ -1227,7 +1227,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
   if (IsBinOp) {
     auto FindOp = [&](ArrayRef<InterchangeableInstruction> CandidateOp) {
       for (Value *V : VL) {
-        if (!isa<Instruction>(V))
+        if (isa<PoisonValue>(V))
           continue;
         for (const InterchangeableInstruction &I : CandidateOp)
           if (cast<Instruction>(V)->getOpcode() == I.Opcode)

From 4752ab65c40392537d2a4c7f5ce648dcdc13542f Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Wed, 11 Dec 2024 23:00:33 -0800
Subject: [PATCH 10/10] replace undef with poison

---
 .../X86/extract-scalar-from-undef.ll | 34 +++++++++----------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 119882f6ced1f..d474a5f2cecae 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,19 +4,17 @@ define i64 @foo(i32 %tmp7) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2
-; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> , <2 x i32> , i64 4)
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> , i32 undef, i32 6
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP7:%.*]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i32> [[TMP0]],
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP1]], <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP9]], 0
 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT: ret i64 [[TMP64]]
 ;
@@ -27,7 +25,7 @@ bb:
   %tmp4 = xor i32 %tmp3, 0
   %tmp6 = sub i32 0, 0
   %tmp8 = sub i32 %tmp7, 0
-  %tmp9 = sub nsw i32 0, undef
+  %tmp9 = sub nsw i32 0, poison
   %tmp10 = add nsw i32 0, %tmp6
   %tmp11 = sub nsw i32 0, %tmp8
   %tmp12 = add i32 0, %tmp10
@@ -42,10 +40,10 @@ bb:
   %tmp21 = add i32 %tmp20, %tmp17
   %tmp22 = sub i32 0, 0
   %tmp23 = add i32 0, 0
-  %tmp24 = sub i32 undef, 0
-  %tmp25 = add nsw i32 %tmp23, undef
+  %tmp24 = sub i32 poison, 0
+  %tmp25 = add nsw i32 %tmp23, poison
   %tmp26 = add nsw i32 %tmp24, %tmp22
-  %tmp27 = sub nsw i32 undef, %tmp24
+  %tmp27 = sub nsw i32 poison, %tmp24
   %tmp28 = add i32 0, %tmp25
   %tmp29 = xor i32 %tmp28, 0
   %tmp30 = add i32 0, %tmp26
@@ -56,7 +54,7 @@ bb:
   %tmp35 = add i32 %tmp34, %tmp29
   %tmp36 = add i32 %tmp35, 0
   %tmp37 = add i32 %tmp36, %tmp33
-  %tmp38 = sub nsw i32 0, undef
+  %tmp38 = sub nsw i32 0, poison
   %tmp39 = add i32 0, %tmp38
   %tmp40 = xor i32 %tmp39, 0
   %tmp41 = add i32 0, %tmp37