diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb21..9a916a663a64c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12689,6 +12689,20 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   return true;
 }
 
+/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
+/// the given mask.
+///
+static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
+                                            int BroadcastableElement = 0) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
+        Mask[i] % Size != BroadcastableElement)
+      return false;
+  return true;
+}
+
 /// If we are extracting two 128-bit halves of a vector and shuffling the
 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
 /// multi-shuffle lowering.
@@ -16190,6 +16204,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
   bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
+  bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
+  bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
 
   // If we have lane crossing shuffles AND they don't all come from the lower
   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -16198,7 +16214,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
-      (V2.getOpcode() != ISD::BUILD_VECTOR))
+      (V2.getOpcode() != ISD::BUILD_VECTOR) &&
+      (!Subtarget.hasAVX2() ||
+       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
     return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
 
   // If we have one input in place, then we can permute the other input and
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index 1baaab0931cb9..26a88ab15e3cc 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -151,8 +151,8 @@ define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_un
 define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32>
   ret <4 x double> %r
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 5fe1e2996ee9b..e2cc3ae0dca0a 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -256,11 +256,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -277,11 +277,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
   %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32>
   %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index bdc1ff4c157e4..a38ca339cd5e1 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -659,57 +659,57 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm1, %xmm9, %xmm0
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
-; AVX2-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT:    vmulpd %xmm0, %xmm9, %xmm10
+; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-NEXT:    vmulpd %xmm3, %xmm1, %xmm4
+; AVX2-NEXT:    vaddpd %xmm4, %xmm10, %xmm4
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
 ; AVX2-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
-; AVX2-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT:    vaddpd %xmm4, %xmm10, %xmm4
 ; AVX2-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
-; AVX2-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
-; AVX2-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vmulsd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vaddsd %xmm3, %xmm9, %xmm3
 ; AVX2-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vaddsd %xmm7, %xmm4, %xmm4
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm7, %xmm1, %xmm9
+; AVX2-NEXT:    vaddsd %xmm7, %xmm3, %xmm3
+; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-NEXT:    vmulpd %xmm4, %xmm0, %xmm7
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT:    vmulpd %xmm1, %xmm9, %xmm10
+; AVX2-NEXT:    vaddpd %xmm7, %xmm10, %xmm7
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm3, %xmm10, %xmm11
-; AVX2-NEXT:    vaddpd %xmm11, %xmm9, %xmm9
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm6, %xmm11, %xmm12
-; AVX2-NEXT:    vaddpd %xmm12, %xmm9, %xmm9
-; AVX2-NEXT:    vmulsd %xmm7, %xmm2, %xmm7
-; AVX2-NEXT:    vmulsd %xmm5, %xmm10, %xmm10
-; AVX2-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
-; AVX2-NEXT:    vmulsd %xmm11, %xmm8, %xmm10
-; AVX2-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
+; AVX2-NEXT:    vmulpd %xmm6, %xmm10, %xmm11
+; AVX2-NEXT:    vaddpd %xmm7, %xmm11, %xmm7
+; AVX2-NEXT:    vmulsd %xmm4, %xmm2, %xmm4
+; AVX2-NEXT:    vmulsd %xmm5, %xmm9, %xmm9
+; AVX2-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vmulsd %xmm10, %xmm8, %xmm9
+; AVX2-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT:    vmulpd %xmm0, %xmm9, %xmm0
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
 ; AVX2-NEXT:    vmulpd %xmm1, %xmm10, %xmm1
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm3, %xmm11, %xmm3
-; AVX2-NEXT:    vaddpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
-; AVX2-NEXT:    vaddpd %xmm6, %xmm1, %xmm1
-; AVX2-NEXT:    vmulsd %xmm2, %xmm10, %xmm2
-; AVX2-NEXT:    vmulsd %xmm5, %xmm11, %xmm5
+; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-NEXT:    vmulpd %xmm1, %xmm6, %xmm6
+; AVX2-NEXT:    vaddpd %xmm6, %xmm0, %xmm0
+; AVX2-NEXT:    vmulsd %xmm2, %xmm9, %xmm2
+; AVX2-NEXT:    vmulsd %xmm5, %xmm10, %xmm5
 ; AVX2-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
-; AVX2-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm3
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
-; AVX2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
-; AVX2-NEXT:    vmovsd %xmm2, 64(%rdi)
-; AVX2-NEXT:    vmovapd %ymm1, 32(%rdi)
-; AVX2-NEXT:    vmovapd %ymm0, (%rdi)
+; AVX2-NEXT:    vmulsd %xmm1, %xmm8, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vbroadcastsd %xmm7, %ymm2
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
+; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm3
+; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
+; AVX2-NEXT:    vmovsd %xmm1, 64(%rdi)
+; AVX2-NEXT:    vmovapd %ymm0, 32(%rdi)
+; AVX2-NEXT:    vmovapd %ymm2, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 79602a18693db..00af58544e25c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -493,11 +493,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX2-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -520,13 +520,13 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm4
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
 ; X86-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT:    vpermt2pd %zmm4, %zmm5, %zmm6
-; X86-AVX512-NEXT:    vmovapd %ymm6, (%edx)
+; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm4
+; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X86-AVX512-NEXT:    vmovapd %ymm4, (%edx)
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
 ; X86-AVX512-NEXT:    vpermi2pd %zmm0, %zmm3, %zmm4
 ; X86-AVX512-NEXT:    vmovapd %ymm4, (%ecx)
@@ -563,11 +563,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ;
 ; X64-AVX2-LABEL: PR48908:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX2-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rdi)
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -587,16 +587,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9]
-; X64-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm4
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX512-NEXT:    vpermt2pd %zmm3, %zmm5, %zmm6
-; X64-AVX512-NEXT:    vmovapd %ymm6, (%rdi)
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1]
-; X64-AVX512-NEXT:    vpermi2pd %zmm0, %zmm4, %zmm3
-; X64-AVX512-NEXT:    vmovapd %ymm3, (%rsi)
+; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
+; X64-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm3
+; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X64-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm4
+; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X64-AVX512-NEXT:    vmovapd %ymm4, (%rdi)
+; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
+; X64-AVX512-NEXT:    vpermi2pd %zmm0, %zmm3, %zmm4
+; X64-AVX512-NEXT:    vmovapd %ymm4, (%rsi)
 ; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,11]
 ; X64-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
 ; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]