Skip to content

[X86] ptest is commutable as long as only the Z flag is used. #88969

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -5688,6 +5688,13 @@ let Predicates = [UseSSE41, OptForSize] in {
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// Pattern fragment that matches an X86ptest node only when its predicate
// holds, so that patterns built on it may swap the two operands.
//
// ptest is commutable if only the Z flag is used. If the C flag is used,
// commuting would change which operand is inverted.
//
// The predicate passes result 0 of the matched node to onlyUsesZeroFlag
// (defined outside this section) and the fragment matches only when that
// helper returns true.
def X86ptest_commutable : PatFrag<(ops node:$src1, node:$src2),
(X86ptest node:$src1, node:$src2), [{
return onlyUsesZeroFlag(SDValue(Node, 0));
}]>;

// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
Expand Down Expand Up @@ -5723,6 +5730,17 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// Commuted load folding for ptest: when the load is the FIRST operand of an
// X86ptest_commutable node, select the register/memory instruction with the
// register as its first operand and the loaded address as its memory operand,
// i.e. with the node's operands swapped.
let Predicates = [HasAVX] in {
// 128-bit AVX form.
def : Pat<(X86ptest_commutable (loadv2i64 addr:$src2), VR128:$src1),
(VPTESTrm VR128:$src1, addr:$src2)>;
// 256-bit AVX form.
def : Pat<(X86ptest_commutable (loadv4i64 addr:$src2), VR256:$src1),
(VPTESTYrm VR256:$src1, addr:$src2)>;
}
// SSE4.1 form. Uses the memopv2i64 fragment instead of loadv2i64.
// NOTE(review): presumably because the non-AVX encoding needs an aligned
// memory operand -- confirm against the memop fragment definition.
let Predicates = [UseSSE41] in {
def : Pat<(X86ptest_commutable (memopv2i64 addr:$src2), VR128:$src1),
(PTESTrm VR128:$src1, addr:$src2)>;
}

// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
Expand All @@ -5737,6 +5755,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

// Pattern fragment that matches an X86testp node only when its predicate
// holds, so that patterns built on it may swap the two operands.
//
// testps/testpd are commutable if only the Z flag is used. If the C flag is
// used, commuting would change which operand is inverted.
//
// The predicate passes result 0 of the matched node to onlyUsesZeroFlag
// (defined outside this section) and the fragment matches only when that
// helper returns true.
def X86testp_commutable : PatFrag<(ops node:$src1, node:$src2),
(X86testp node:$src1, node:$src2), [{
return onlyUsesZeroFlag(SDValue(Node, 0));
}]>;

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
Expand All @@ -5752,6 +5777,18 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
}
}

// Commuted load folding for vtestps/vtestpd: when the load is the FIRST
// operand of an X86testp_commutable node, select the register/memory
// instruction with the register as its first operand and the loaded address
// as its memory operand, i.e. with the node's operands swapped.
let Predicates = [HasAVX] in {
// Single-precision: 128-bit (v4f32) and 256-bit (v8f32) loads.
def : Pat<(X86testp_commutable (loadv4f32 addr:$src2), VR128:$src),
(VTESTPSrm VR128:$src, addr:$src2)>;
def : Pat<(X86testp_commutable (loadv8f32 addr:$src2), VR256:$src),
(VTESTPSYrm VR256:$src, addr:$src2)>;

// Double-precision: 128-bit (v2f64) and 256-bit (v4f64) loads.
def : Pat<(X86testp_commutable (loadv2f64 addr:$src2), VR128:$src),
(VTESTPDrm VR128:$src, addr:$src2)>;
def : Pat<(X86testp_commutable (loadv4f64 addr:$src2), VR256:$src),
(VTESTPDYrm VR256:$src, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/combine-ptest.ll
Original file line number Diff line number Diff line change
Expand Up @@ -400,17 +400,15 @@ define i1 @PR38788(<4 x i32> %0, <4 x i32> %1) {
define i32 @PR88958_1(ptr %0, <2 x i64> %1) {
; SSE-LABEL: PR88958_1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm0, %xmm1
; SSE-NEXT: ptest (%rdi), %xmm0
; SSE-NEXT: sete %al
; SSE-NEXT: retq
;
; AVX-LABEL: PR88958_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm0, %xmm1
; AVX-NEXT: vptest (%rdi), %xmm0
; AVX-NEXT: sete %al
; AVX-NEXT: retq
%3 = load <2 x i64>, ptr %0
Expand Down
82 changes: 82 additions & 0 deletions llvm/test/CodeGen/X86/combine-testpd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,88 @@ end: ; preds = %entry
ret void
}

; 128-bit vtestz.pd with the loaded value as the first argument. The result is
; read with sete, and the expected output folds the load into the memory
; operand of vtestpd instead of loading into a register first.
; NOTE(review): the SSE-prefixed lines below expect ptest for a vtestpd
; intrinsic and look copied from another test file -- confirm a RUN line
; actually uses that prefix, otherwise they are dead.
define i32 @PR88958_1(ptr %0, <2 x double> %1) {
; SSE-LABEL: PR88958_1:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest (%rdi), %xmm0
; SSE-NEXT: sete %al
; SSE-NEXT: retq
;
; CHECK-LABEL: PR88958_1:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd (%rdi), %xmm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
%3 = load <2 x double>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %3, <2 x double> %1)
ret i32 %4
}

; 128-bit vtestc.pd with the loaded value as the first argument. The result is
; read with setb (carry flag), so the expected output keeps the separate load
; and the original operand order: the load is NOT folded.
; NOTE(review): the SSE-prefixed lines below expect ptest for a vtestpd
; intrinsic and look copied from another test file -- confirm a RUN line
; actually uses that prefix, otherwise they are dead.
define i32 @PR88958_2(ptr %0, <2 x double> %1) {
; SSE-LABEL: PR88958_2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm0, %xmm1
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; CHECK-LABEL: PR88958_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %xmm1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %xmm0, %xmm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
%3 = load <2 x double>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %3, <2 x double> %1)
ret i32 %4
}

; 256-bit vtestz.pd with the loaded value as the first argument. The result is
; read with sete, and the expected output folds the load into the memory
; operand of vtestpd instead of loading into a register first.
;
; The former SSE-prefixed block was removed: it was labelled PR88958_1 and
; checked a 128-bit ptest, so it could never describe this 256-bit function.
define i32 @PR88958_3(ptr %0, <4 x double> %1) {
; CHECK-LABEL: PR88958_3:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd (%rdi), %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%3 = load <4 x double>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %3, <4 x double> %1)
ret i32 %4
}

; 256-bit vtestc.pd with the loaded value as the first argument. The result is
; read with setb (carry flag), so the expected output keeps the separate load
; and the original operand order: the load is NOT folded.
;
; The former SSE-prefixed block was removed: it was labelled PR88958_2 and
; checked a 128-bit ptest, so it could never describe this 256-bit function.
define i32 @PR88958_4(ptr %0, <4 x double> %1) {
; CHECK-LABEL: PR88958_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %ymm1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %ymm0, %ymm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%3 = load <4 x double>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %3, <4 x double> %1)
ret i32 %4
}

declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone
Expand Down
82 changes: 82 additions & 0 deletions llvm/test/CodeGen/X86/combine-testps.ll
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,88 @@ end: ; preds = %entry
ret void
}

; 128-bit vtestz.ps with the loaded value as the first argument. The result is
; read with sete, and the expected output folds the load into the memory
; operand of vtestps instead of loading into a register first.
; NOTE(review): the SSE-prefixed lines below expect ptest for a vtestps
; intrinsic and look copied from another test file -- confirm a RUN line
; actually uses that prefix, otherwise they are dead.
define i32 @PR88958_1(ptr %0, <4 x float> %1) {
; SSE-LABEL: PR88958_1:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest (%rdi), %xmm0
; SSE-NEXT: sete %al
; SSE-NEXT: retq
;
; CHECK-LABEL: PR88958_1:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps (%rdi), %xmm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
%3 = load <4 x float>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %3, <4 x float> %1)
ret i32 %4
}

; 128-bit vtestc.ps with the loaded value as the first argument. The result is
; read with setb (carry flag), so the expected output keeps the separate load
; and the original operand order: the load is NOT folded.
; NOTE(review): the SSE-prefixed lines below expect ptest for a vtestps
; intrinsic and look copied from another test file -- confirm a RUN line
; actually uses that prefix, otherwise they are dead.
define i32 @PR88958_2(ptr %0, <4 x float> %1) {
; SSE-LABEL: PR88958_2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm0, %xmm1
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; CHECK-LABEL: PR88958_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %xmm0, %xmm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
%3 = load <4 x float>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %3, <4 x float> %1)
ret i32 %4
}

; 256-bit vtestz.ps with the loaded value as the first argument. The result is
; read with sete, and the expected output folds the load into the memory
; operand of vtestps instead of loading into a register first.
;
; The former SSE-prefixed block was removed: it was labelled PR88958_1 and
; checked a 128-bit ptest, so it could never describe this 256-bit function.
define i32 @PR88958_3(ptr %0, <8 x float> %1) {
; CHECK-LABEL: PR88958_3:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps (%rdi), %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%3 = load <8 x float>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %3, <8 x float> %1)
ret i32 %4
}

; 256-bit vtestc.ps with the loaded value as the first argument. The result is
; read with setb (carry flag), so the expected output keeps the separate load
; and the original operand order: the load is NOT folded.
;
; The former SSE-prefixed block was removed: it was labelled PR88958_2 and
; checked a 128-bit ptest, so it could never describe this 256-bit function.
define i32 @PR88958_4(ptr %0, <8 x float> %1) {
; CHECK-LABEL: PR88958_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %ymm0, %ymm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%3 = load <8 x float>, ptr %0
%4 = tail call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %3, <8 x float> %1)
ret i32 %4
}

declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone
Expand Down
32 changes: 19 additions & 13 deletions llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1018,32 +1018,38 @@ define zeroext i1 @PR44781(ptr %0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: PR44781:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovdqu (%rdi), %xmm0
; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX1OR2-NEXT: sete %al
; AVX1OR2-NEXT: retq
; AVX1-LABEL: PR44781:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [15,15,15,15]
; AVX1-NEXT: vptest (%rdi), %xmm0
; AVX1-NEXT: sete %al
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR44781:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
; AVX2-NEXT: vptest (%rdi), %xmm0
; AVX2-NEXT: sete %al
; AVX2-NEXT: retq
;
; AVX512F-LABEL: PR44781:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
; AVX512F-NEXT: vptest (%rdi), %xmm0
; AVX512F-NEXT: sete %al
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: PR44781:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
; AVX512BW-NEXT: vptest (%rdi), %xmm0
; AVX512BW-NEXT: sete %al
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR44781:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455]
; AVX512BWVL-NEXT: vptest %xmm1, %xmm0
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [64424509455,64424509455]
; AVX512BWVL-NEXT: vptest (%rdi), %xmm0
; AVX512BWVL-NEXT: sete %al
; AVX512BWVL-NEXT: retq
%2 = load <4 x i32>, ptr %0, align 4
Expand Down