[X86] lowerV4F64Shuffle - prefer lowerShuffleAsDecomposedShuffleMerge if we're blending inplace/splatable shuffle inputs on AVX2 targets #126420

Merged (1 commit, Feb 9, 2025)
20 changes: 19 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12689,6 +12689,20 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  return true;
}

/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
/// the given mask.
///
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
                                            int BroadcastableElement = 0) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}

/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
@@ -16190,6 +16204,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

  bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
  bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
  bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
  bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);

  // If we have lane crossing shuffles AND they don't all come from the lower
  // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -16198,7 +16214,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
      (V1.getOpcode() != ISD::BUILD_VECTOR) &&
      (V2.getOpcode() != ISD::BUILD_VECTOR))
      (V2.getOpcode() != ISD::BUILD_VECTOR) &&
      (!Subtarget.hasAVX2() ||
       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
    return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);

  // If we have one input in place, then we can permute the other input and
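To make the new heuristic concrete, here is a minimal standalone sketch of the two mask predicates and the gate they feed, applied to the v4f64 mask <0, 1, 2, 4> from the first test below. It is not part of the patch: the names isInputInPlace, isInputBroadcastable, HasAVX2 and Decompose are illustrative assumptions, std::vector stands in for ArrayRef, and the real condition in lowerV4F64Shuffle additionally requires a 128-bit lane-crossing mask and non-BUILD_VECTOR operands.

// Standalone sketch, illustrative only (assumed names, simplified gate).
#include <cstdio>
#include <vector>

// True if input 'Input' (0 or 1) only supplies elements that are already in
// their final position, i.e. Mask[i] % Size == i for that input's elements.
static bool isInputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

// True if input 'Input' only supplies a single element (element 0 by
// default), so it can be materialized with a broadcast and blended in.
static bool isInputBroadcastable(int Input, const std::vector<int> &Mask,
                                 int BroadcastableElement = 0) {
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}

int main() {
  // shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <0, 1, 2, 4>
  std::vector<int> Mask = {0, 1, 2, 4};

  bool V1IsInPlace = isInputInPlace(0, Mask);       // true: 0, 1, 2 stay put
  bool V1IsSplat = isInputBroadcastable(0, Mask);   // false: uses elts 0, 1, 2
  bool V2IsInPlace = isInputInPlace(1, Mask);       // false: slot 3 wants V2[0], not V2[3]
  bool V2IsSplat = isInputBroadcastable(1, Mask);   // true: only V2[0] is used

  bool HasAVX2 = true; // assume an AVX2 target for this sketch
  // Simplified form of the new gate: when every input is either in place or
  // splatable, skip lowerShuffleAsLanePermuteAndSHUFP and decompose the
  // shuffle into a broadcast plus a blend instead.
  bool Decompose =
      HasAVX2 && (V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat);

  std::printf("V1 in-place=%d splat=%d, V2 in-place=%d splat=%d -> %s\n",
              V1IsInPlace, V1IsSplat, V2IsInPlace, V2IsSplat,
              Decompose ? "decompose (broadcast + blend)"
                        : "lane permute + SHUFPD");
  return 0;
}

For the mask above this prints the decompose path, which is exactly the case exercised by the first updated test below.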
@@ -151,8 +151,8 @@ define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_un
define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; CHECK-NEXT: retq
%r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x double> %r
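In the test above, the mask <0, 1, 2, 4> keeps elements 0..2 of %x in place and takes only element 0 of %y, so the new AVX2 gate fires: rather than widening %y with vinsertf128 and merging through a lane-crossing vshufpd, the shuffle is decomposed into a vbroadcastsd of %y's low element plus a single vblendps that replaces the top double, matching the updated CHECK lines.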
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -256,11 +256,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -277,11 +277,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
%9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
%10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
82 changes: 41 additions & 41 deletions llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -659,57 +659,57 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm0
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm4, %xmm3, %xmm10
; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm10
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm3, %xmm1, %xmm4
; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm10
; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm9
; AVX2-NEXT: vmulsd %xmm4, %xmm5, %xmm4
; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
; AVX2-NEXT: vmulsd %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vaddsd %xmm3, %xmm9, %xmm3
; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7
; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9
; AVX2-NEXT: vaddsd %xmm7, %xmm3, %xmm3
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm4, %xmm0, %xmm7
; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm10
; AVX2-NEXT: vaddpd %xmm7, %xmm10, %xmm7
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm3, %xmm10, %xmm11
; AVX2-NEXT: vaddpd %xmm11, %xmm9, %xmm9
; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm6, %xmm11, %xmm12
; AVX2-NEXT: vaddpd %xmm12, %xmm9, %xmm9
; AVX2-NEXT: vmulsd %xmm7, %xmm2, %xmm7
; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm10
; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
; AVX2-NEXT: vmulsd %xmm11, %xmm8, %xmm10
; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
; AVX2-NEXT: vmulpd %xmm6, %xmm10, %xmm11
; AVX2-NEXT: vaddpd %xmm7, %xmm11, %xmm7
; AVX2-NEXT: vmulsd %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vmulsd %xmm5, %xmm9, %xmm9
; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
; AVX2-NEXT: vmulsd %xmm10, %xmm8, %xmm9
; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm0
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm1, %xmm10, %xmm1
; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm3, %xmm11, %xmm3
; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm3, %xmm6, %xmm6
; AVX2-NEXT: vaddpd %xmm6, %xmm1, %xmm1
; AVX2-NEXT: vmulsd %xmm2, %xmm10, %xmm2
; AVX2-NEXT: vmulsd %xmm5, %xmm11, %xmm5
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm1, %xmm6, %xmm6
; AVX2-NEXT: vaddpd %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm2
; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm5
; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3
; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
; AVX2-NEXT: vmovsd %xmm2, 64(%rdi)
; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
; AVX2-NEXT: vmovapd %ymm0, (%rdi)
; AVX2-NEXT: vmulsd %xmm1, %xmm8, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vbroadcastsd %xmm7, %ymm2
; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm3
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
; AVX2-NEXT: vmovsd %xmm1, 64(%rdi)
; AVX2-NEXT: vmovapd %ymm0, 32(%rdi)
; AVX2-NEXT: vmovapd %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -493,11 +493,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
; X86-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
; X86-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -520,13 +520,13 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6
; X86-AVX512-NEXT: vmovapd %ymm6, (%edx)
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
; X86-AVX512-NEXT: vmovapd %ymm4, (%edx)
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
@@ -563,11 +563,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
;
; X64-AVX2-LABEL: PR48908:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
; X64-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
; X64-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -587,16 +587,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9]
; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6
; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1]
; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11]
; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]