diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index da03a69708ddf..da3c780550a08 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1395,60 +1395,91 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } -/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into -/// "binop (shuffle), (shuffle)". +/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". bool VectorCombine::foldShuffleOfBinops(Instruction &I) { - auto *VecTy = cast(I.getType()); BinaryOperator *B0, *B1; - ArrayRef Mask; + ArrayRef OldMask; if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), - m_Mask(Mask))) || - B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy) + m_Mask(OldMask)))) return false; // Don't introduce poison into div/rem. - if (any_of(Mask, [](int M) { return M == PoisonMaskElem; }) && + if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) && B0->isIntDivRem()) return false; - // Try to replace a binop with a shuffle if the shuffle is not costly. - // The new shuffle will choose from a single, common operand, so it may be - // cheaper than the existing two-operand shuffle. - SmallVector UnaryMask = createUnaryMask(Mask, Mask.size()); + // TODO: Add support for addlike etc. Instruction::BinaryOps Opcode = B0->getOpcode(); - InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy); - InstructionCost ShufCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask); - if (ShufCost > BinopCost) + if (Opcode != B1->getOpcode()) + return false; + + auto *ShuffleDstTy = dyn_cast(I.getType()); + auto *BinOpTy = dyn_cast(B0->getType()); + if (!ShuffleDstTy || !BinOpTy) return false; + unsigned NumSrcElts = BinOpTy->getNumElements(); + // If we have something like "add X, Y" and "add Z, X", swap ops to match. Value *X = B0->getOperand(0), *Y = B0->getOperand(1); Value *Z = B1->getOperand(0), *W = B1->getOperand(1); - if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W) + if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W && + (X == W || Y == Z)) std::swap(X, Y); - Value *Shuf0, *Shuf1; + auto ConvertToUnary = [NumSrcElts](int &M) { + if (M >= (int)NumSrcElts) + M -= NumSrcElts; + }; + + SmallVector NewMask0(OldMask.begin(), OldMask.end()); + TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc; if (X == Z) { - // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W) - Shuf0 = Builder.CreateShuffleVector(X, UnaryMask); - Shuf1 = Builder.CreateShuffleVector(Y, W, Mask); - } else if (Y == W) { - // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y) - Shuf0 = Builder.CreateShuffleVector(X, Z, Mask); - Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask); - } else { - return false; + llvm::for_each(NewMask0, ConvertToUnary); + SK0 = TargetTransformInfo::SK_PermuteSingleSrc; + Z = PoisonValue::get(BinOpTy); } + SmallVector NewMask1(OldMask.begin(), OldMask.end()); + TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc; + if (Y == W) { + llvm::for_each(NewMask1, ConvertToUnary); + SK1 = TargetTransformInfo::SK_PermuteSingleSrc; + W = PoisonValue::get(BinOpTy); + } + + // Try to replace a binop with a shuffle if the shuffle is not costly. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + InstructionCost OldCost = + TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) + + TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, + OldMask, CostKind, 0, nullptr, {B0, B1}, &I); + + InstructionCost NewCost = + TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + + TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) + + TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); + + LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + if (NewCost >= OldCost) + return false; + + Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); + Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); + // Intersect flags from the old binops. if (auto *NewInst = dyn_cast(NewBO)) { NewInst->copyIRFlags(B0); NewInst->andIRFlags(B1); } - // TODO: Add Shuf0/Shuf1 to WorkList? + Worklist.pushValue(Shuf0); + Worklist.pushValue(Shuf1); replaceValue(I, *NewBO); return true; } diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll index a19b205e68a62..c423053a9a483 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll @@ -25,9 +25,9 @@ define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x floa define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: define <4 x i32> @shuf_add_v4i32_xx( ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> -; CHECK-NEXT: [[R2:%.*]] = add <4 x i32> [[TMP1]], [[R1]] +; CHECK-NEXT: [[B0:%.*]] = add <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = add <4 x i32> [[X]], [[Z]] +; CHECK-NEXT: [[R2:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R2]] ; %b0 = add <4 x i32> %x, %y @@ -36,15 +36,22 @@ define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ret <4 x i32> %r } -; For commutative instructions, common operand may be swapped. +; For commutative instructions, common operand may be swapped (SSE - expensive fmul vs AVX - cheap fmul) define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( -; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[R]] +; SSE-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( +; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: ret <4 x float> [[R]] +; +; AVX-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( +; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]] +; AVX-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]] +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> +; AVX-NEXT: ret <4 x float> [[R]] ; %b0 = fmul <4 x float> %x, %y %b1 = fmul <4 x float> %z, %x @@ -57,9 +64,9 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: define <2 x i64> @shuf_and_v2i64_yy_swap( ; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[B0:%.*]] = and <2 x i64> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = and <2 x i64> [[Y]], [[Z]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> ; CHECK-NEXT: ret <2 x i64> [[R]] ; %b0 = and <2 x i64> %x, %y @@ -84,15 +91,22 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ret <4 x i32> %r } -; negative test - common operand, but not commutable +; common operand, but not commutable (SSE - expensive vector shift vs AVX2 - cheap vector shift) define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( -; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]] -; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R1]] +; SSE-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( +; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> +; SSE-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: ret <4 x i32> [[R]] +; +; AVX-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( +; AVX-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]] +; AVX-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]] +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; AVX-NEXT: ret <4 x i32> [[R]] ; %b0 = shl <4 x i32> %x, %y %b1 = shl <4 x i32> %z, %x @@ -116,7 +130,7 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z ret <2 x i64> %r } -; negative test - type change via shuffle +; type change via shuffle define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) { ; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type( @@ -168,14 +182,14 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ret <4 x i32> %r } -; negative test - must have matching operand +; non-matching operands (not commutable) define <4 x float> @shuf_fdiv_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { ; CHECK-LABEL: define <4 x float> @shuf_fdiv_v4f32_no_common_op( ; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = fdiv <4 x float> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = fdiv <4 x float> [[Z]], [[W]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[W]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[R]] ; %b0 = fdiv <4 x float> %x, %y @@ -216,6 +230,3 @@ define <4 x i32> @shuf_srem_v4i32_poison(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %r } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} -; SSE: {{.*}}