[X86] Lower vXi8 multiplies by constant using PMADDUBSW on SSSE3+ targets #95403
Conversation
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes
As discussed on #90748, we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together. This patch limits the lowering to multiplies by constants, but most targets would benefit from performing this for non-constant cases as well - it's just Intel Core/SandyBridge era CPUs that might experience additional Port0/5 contention.
Patch is 141.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95403.diff 18 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 88c7a4159856a..a59cd712c4f9e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28493,6 +28493,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
@@ -28506,6 +28508,31 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ // For vXi8 mul-by-constant, try PMADDUBSW to avoid the need for extension.
+ // Don't do this if we only need to unpack one half.
+ if (Subtarget.hasSSSE3() &&
+ ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ bool IsLoLaneZeroOrUndef = true;
+ bool IsHiLaneZeroOrUndef = true;
+ for (auto [Idx, Val] : enumerate(B->ops())) {
+ if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
+ IsHiLaneZeroOrUndef &= isNullConstantOrUndef(Val);
+ else
+ IsLoLaneZeroOrUndef &= isNullConstantOrUndef(Val);
+ }
+ if (!(IsLoLaneZeroOrUndef || IsHiLaneZeroOrUndef)) {
+ SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
+ SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
+ SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
+ SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
+ SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
+ RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
+ DAG.getTargetConstant(8, dl, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
+ }
+ }
+
// Extract the lo/hi parts to any extend to i16.
// We're going to mask off the low byte of each result element of the
// pmullw, so it doesn't matter what's in the high byte of each 16-bit
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 5d7bf4a2c9788..18e9cf7da110e 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -541,15 +541,12 @@ define i64 @combine_mul_smul_lohi_const_i64(i64 %h) {
define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-LABEL: PR35579:
; SSE: # %bb.0:
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psllw $8, %xmm1
+; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR35579:
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index 6232488bea71b..c2048bccc82e1 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -385,27 +385,21 @@ define <16 x i8> @splatvar_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; GFNISSE-NEXT: pand %xmm2, %xmm0
-; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT: pand %xmm2, %xmm1
-; GFNISSE-NEXT: packuswb %xmm0, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: psllw $8, %xmm1
+; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v16i8:
; GFNIAVX1: # %bb.0:
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: constant_shl_v16i8:
@@ -1224,72 +1218,57 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [128,64,32,16,8,4,2,1]
-; GFNISSE-NEXT: pmullw %xmm4, %xmm0
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; GFNISSE-NEXT: pand %xmm5, %xmm0
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: pmullw %xmm6, %xmm2
-; GFNISSE-NEXT: pand %xmm5, %xmm2
-; GFNISSE-NEXT: packuswb %xmm0, %xmm2
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmullw %xmm4, %xmm1
-; GFNISSE-NEXT: pand %xmm5, %xmm1
-; GFNISSE-NEXT: pmullw %xmm6, %xmm3
-; GFNISSE-NEXT: pand %xmm5, %xmm3
-; GFNISSE-NEXT: packuswb %xmm1, %xmm3
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa %xmm3, %xmm1
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm3
+; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
+; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0
+; GFNISSE-NEXT: psllw $8, %xmm0
+; GFNISSE-NEXT: por %xmm3, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
+; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1
+; GFNISSE-NEXT: psllw $8, %xmm1
+; GFNISSE-NEXT: por %xmm3, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1]
-; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
+; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm3
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
+; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm2
; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
-; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: constant_shl_v32i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: constant_shl_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; GFNIAVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
; GFNIAVX512VL-NEXT: retq
;
; GFNIAVX512BW-LABEL: constant_shl_v32i8:
@@ -2588,140 +2567,109 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v64i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm7 = [128,64,32,16,8,4,2,1]
-; GFNISSE-NEXT: pmullw %xmm7, %xmm1
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; GFNISSE-NEXT: pand %xmm6, %xmm1
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: pmullw %xmm8, %xmm0
-; GFNISSE-NEXT: pand %xmm6, %xmm0
-; GFNISSE-NEXT: packuswb %xmm1, %xmm0
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmullw %xmm7, %xmm4
-; GFNISSE-NEXT: pand %xmm6, %xmm4
-; GFNISSE-NEXT: pmullw %xmm8, %xmm1
-; GFNISSE-NEXT: pand %xmm6, %xmm1
-; GFNISSE-NEXT: packuswb %xmm4, %xmm1
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmullw %xmm7, %xmm2
-; GFNISSE-NEXT: pand %xmm6, %xmm2
-; GFNISSE-NEXT: pmullw %xmm8, %xmm4
-; GFNISSE-NEXT: pand %xmm6, %xmm4
-; GFNISSE-NEXT: packuswb %xmm2, %xmm4
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNISSE-NEXT: pmullw %xmm7, %xmm3
-; GFNISSE-NEXT: pand %xmm6, %xmm3
-; GFNISSE-NEXT: pmullw %xmm8, %xmm5
-; GFNISSE-NEXT: pand %xmm6, %xmm5
-; GFNISSE-NEXT: packuswb %xmm3, %xmm5
-; GFNISSE-NEXT: movdqa %xmm4, %xmm2
-; GFNISSE-NEXT: movdqa %xmm5, %xmm3
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
+; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm0
+; GFNISSE-NEXT: psllw $8, %xmm0
+; GFNISSE-NEXT: por %xmm6, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1
+; GFNISSE-NEXT: psllw $8, %xmm1
+; GFNISSE-NEXT: por %xmm6, %xmm1
+; GFNISSE-NEXT: movdqa %xmm2, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2
+; GFNISSE-NEXT: psllw $8, %xmm2
+; GFNISSE-NEXT: por %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3
+; GFNISSE-NEXT: psllw $8, %xmm3
+; GFNISSE-NEXT: por %xmm6, %xmm3
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v64i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [128,64,32,16,8,4,2,1]
-; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,4,16,64,128,32,8,2]
+; GFNIAVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm4
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
-; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
+; GFNIAVX1-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; GFNIAVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmaddubsw %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; GFNIAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: constant_shl_v64i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
-; GFNIAVX2-NEXT: # ymm3 = mem[0,1,0,1]
-; GFNIAVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; GFNIAVX2-NEXT: vpand %ymm4, %y...
[truncated]
llvm/test/CodeGen/X86/combine-mul.ll (Outdated)
; SSE-NEXT: packuswb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
It took me a few seconds to guess the data. It would be helpful if we could show it in a comment, like
constant = 0, 1, 0, 1...
Yes, I'd like to get that done - let me see what I can put together in AsmPrinter. I did a lot of the heavy lifting for X86FixupVectorConstants.
I think we considered adding asm comments like this a few years ago but it never went anywhere - the last I remember was a discussion about whether float numbers should always be printed as fixed-width hex... @topperc might remember better.
@phoebewang Do you have any thoughts on non-constants? I could add an IsPMADDUBSWSlow tuning tag for SandyBridge, but it'd be annoying if we had to set it for any of the generic x86-64-v levels as well.
TEST changes LGTM
Do you mean the problem that early Core CPUs have TP = 1? https://uops.info/table.html?search=PMADDUBSW&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SNB=on&cb_IVB=on&cb_SKL=on&cb_AMT=on&cb_ZEN3=on&cb_measurements=on&cb_doc=on&cb_base=on&cb_avx=on&cb_avx2=on&cb_sse=on The latter seems to be the case on the latest CPUs as well.
AFAICT this would put all the PMADDUBSW and PSLLW instructions on Port0, while the older PMULLW+UNPCK/PACK sequence is spread between Port0 and Port15.
Oh, I forgot PSLLW. Do you take PAND/PANDN into account? I think non-constant cases need them, right?
This is the breakdown I created for #90748: https://llvm.godbolt.org/z/9361GKrds
I see the cycle count is still slightly better on SandyBridge. I'm OK to start without a tuning tag. We can revisit it if we notice a notable performance drop.
Thanks - I'm going to get the constant comment code done first, then get this patch committed, then I'll raise a new PR for non-constant multiplies. |
Based on feedback from #95403 - we use multiply by constant for various lowerings (shifts, division, etc.), so it's very useful to print out the constants to help understand the transform involved. vXi16 multiplies are the easiest to add for this initial commit, but we can add other arithmetic instructions as follow-ups when the need arises (I intend to add PMADDUBSW handling for #95403 next). I've done my best to update all test checks, but there are bound to be ones that got missed that will only appear when the file is regenerated.
LGTM.
Extends #95403 to handle non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together. Most targets benefit from performing this for non-constant cases - it's just Intel Core/SandyBridge era CPUs that might experience additional Port0/15 contention (but a lower instruction count). Fixes #90748
As discussed on #90748 - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together.
This patch limits the lowering to multiplies by constants, but most targets would benefit from performing this for non-constant cases as well - it's just Intel Core/SandyBridge era CPUs that might experience additional Port0/5 contention.
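For readers of the generated assembly above, here is a minimal scalar sketch of what the new lowering computes - plain standalone C++, not LLVM code; the input values and the pmaddubsw_lane helper are made up for illustration. PMADDUBSW multiplies unsigned bytes of its first operand by signed bytes of its second and sums adjacent product pairs with signed saturation; because the constant operand is pre-masked so that one byte of each pair is zero, saturation can never trigger and each i16 lane holds a single byte product, which the AND/PSLLW/POR sequence then recombines.

#include <cstdint>
#include <cstdio>

// Model of one 16-bit lane of PMADDUBSW: unsigned bytes from A, signed bytes
// from B, adjacent products summed with signed saturation.
static int16_t pmaddubsw_lane(uint8_t a0, int8_t b0, uint8_t a1, int8_t b1) {
  int32_t Sum = int32_t(a0) * b0 + int32_t(a1) * b1;
  if (Sum > INT16_MAX) Sum = INT16_MAX;
  if (Sum < INT16_MIN) Sum = INT16_MIN;
  return int16_t(Sum);
}

int main() {
  uint8_t A[16], B[16], Ref[16], Res[16];
  for (int i = 0; i != 16; ++i) {
    A[i] = uint8_t(17 * i + 3);    // arbitrary input bytes
    B[i] = uint8_t(29 * i + 5);    // stand-in for the constant multiplier vector
    Ref[i] = uint8_t(A[i] * B[i]); // what a plain vXi8 multiply produces
  }
  for (int i = 0; i != 8; ++i) {
    // BLo = B & 0x00FF per i16 lane: the odd byte is zero, so the lane is
    // exactly A[2i] * B[2i] (no saturation possible with one zero product).
    int16_t Lo = pmaddubsw_lane(A[2 * i], int8_t(B[2 * i]), A[2 * i + 1], 0);
    // BHi = ANDNP(Mask, B) = B & 0xFF00 per i16 lane: the even byte is zero,
    // so the lane is exactly A[2i+1] * B[2i+1].
    int16_t Hi = pmaddubsw_lane(A[2 * i], 0, A[2 * i + 1], int8_t(B[2 * i + 1]));
    // RLo & 0x00FF, RHi << 8, then OR - mirroring the AND/VSHLI/OR DAG nodes.
    uint16_t Merged = uint16_t(Lo & 0x00FF) | uint16_t(uint16_t(Hi) << 8);
    Res[2 * i + 0] = uint8_t(Merged & 0xFF);
    Res[2 * i + 1] = uint8_t(Merged >> 8);
  }
  // Only the low 8 bits of each product survive, so treating the constant
  // bytes as signed inside PMADDUBSW does not change the result.
  for (int i = 0; i != 16; ++i)
    if (Res[i] != Ref[i]) {
      printf("mismatch at lane %d\n", i);
      return 1;
    }
  printf("PMADDUBSW lowering matches the plain vXi8 multiply\n");
  return 0;
}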