[AMDGPU] Adds pre-commit test for fmul-select combine #111107

vg0204 · 2024-10-04T07:27:37Z

This adds the f32/f64/f16/bf16 test cases for below pattern :

fmul x, select(y, A, B)
with just one use of select Inst above.

It acts as a pre-commit test for DAG-combining the above pattern into a cheaper ldexp when the select operands are non-inline 32-bit constants. (#111109)

llvmbot · 2024-10-04T07:28:08Z

@llvm/pr-subscribers-backend-amdgpu

Author: Vikash Gupta (vg0204)

Changes

This adds the f32/f64 test cases for below pattern :

fmul x, select(y, 2.0, 1.0)
fmul x, select(y, 0.5, 1.0)

It acts as pre-commit tests for dagCombining above pattern into cheaper ldexp in f64 case.

Full diff: https://github.com/llvm/llvm-project/pull/111107.diff

1 Files Affected:

(added) llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll (+342)

diff --git a/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
new file mode 100644
index 00000000000000..c20cf332422fef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
@@ -0,0 +1,342 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i1 %bool) { ; scalar f32: %x * (%bool ? 2.0 : 1.0)
+; GFX9-LABEL: fmul_select_f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, float 2.000000e+00, float 1.000000e+00 ; multiplier: 2.0 when %bool is true, else 1.0
+  %ldexp = fmul float %x, %1 ; plain fmul; the checks above expect v_cndmask + v_mul_f32, no ldexp yet
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test2(float %x, i1 %bool) { ; scalar f32: %x * (%bool ? 0.5 : 1.0)
+; GFX9-LABEL: fmul_select_f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, float 0.500000e+00, float 1.000000e+00 ; multiplier: 0.5 when %bool is true, else 1.0
+  %ldexp = fmul float %x, %1 ; plain fmul; the checks above expect v_cndmask + v_mul_f32, no ldexp yet
+  ret float %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test1(<2 x float> %x, <2 x i1> %bool) { ; <2 x float>: each lane of %x times (its %bool lane ? 2.0 : 1.0)
+; GFX9-LABEL: fmul_select_v2f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00> ; per-lane multiplier: splat 2.0 vs splat 1.0
+  %ldexp = fmul <2 x float> %x, %1 ; plain vector fmul; the checks above expect one select + f32 multiply per lane
+  ret <2 x float> %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test2(<2 x float> %x, <2 x i1> %bool) { ; <2 x float>: each lane of %x times (its %bool lane ? 0.5 : 1.0)
+; GFX9-LABEL: fmul_select_v2f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x float> <float 0.500000e+00, float 0.500000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00> ; per-lane multiplier: splat 0.5 vs splat 1.0
+  %ldexp = fmul <2 x float> %x, %1 ; plain vector fmul; the checks above expect one select + f32 multiply per lane
+  ret <2 x float> %ldexp
+}
+
+define double @fmul_select_f64_test1(double %x, i1 %bool) { ; scalar f64: %x * (%bool ? 2.0 : 1.0)
+; GFX9-LABEL: fmul_select_f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 2.0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 0x3ff00000, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 0x3ff00000, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, double 2.000000e+00, double 1.000000e+00 ; multiplier: 2.0 when %bool is true, else 1.0
+  %ldexp = fmul double %x, %1 ; plain fmul; the checks above select only the multiplier's high dword (low dword is 0) and then use v_mul_f64
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test2(double %x, i1 %bool) { ; scalar f64: %x * (%bool ? 0.5 : 1.0)
+; GFX9-LABEL: fmul_select_f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3fe00000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0x3fe00000
+; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_cndmask_b32_e32 v3, 0x3ff00000, v4, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v4, 0x3fe00000 :: v_dual_and_b32 v3, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v3, 0x3ff00000, v4
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, double 0.500000e+00, double 1.000000e+00 ; multiplier: 0.5 when %bool is true, else 1.0
+  %ldexp = fmul double %x, %1 ; plain fmul; the checks above select the high dword between 0x3ff00000 and 0x3fe00000 (low dword is 0) and then use v_mul_f64
+  ret double %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test1(<2 x double> %x, <2 x i1> %bool) { ; <2 x double>: each lane of %x times (its %bool lane ? 2.0 : 1.0)
+; GFX9-LABEL: fmul_select_v2f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, 2.0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[5:6]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, 2.0, vcc
+; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v6, 1, v4
+; GFX1030-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1030-NEXT:    v_cndmask_b32_e64 v7, 0x3ff00000, 2.0, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x3ff00000, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v6, 1, v4
+; GFX1100-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v5, 1, v5
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1100-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1100-NEXT:    v_cndmask_b32_e64 v7, 0x3ff00000, 2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x3ff00000, 2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00> ; per-lane multiplier: splat 2.0 vs splat 1.0
+  %ldexp = fmul <2 x double> %x, %1 ; plain vector fmul; the checks above expect one high-dword select + v_mul_f64 per lane
+  ret <2 x double> %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test2(<2 x double> %x, <2 x i1> %bool) { ; <2 x double>: each lane of %x times (its %bool lane ? 0.5 : 1.0)
+; GFX9-LABEL: fmul_select_v2f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[5:6]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v6, 1, v4
+; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX1030-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1030-NEXT:    v_cndmask_b32_e32 v7, 0x3ff00000, v8, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1030-NEXT:    v_cndmask_b32_e32 v5, 0x3ff00000, v8, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v6, 1, v4
+; GFX1100-NEXT:    v_dual_mov_b32 v8, 0x3fe00000 :: v_dual_and_b32 v5, 1, v5
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1100-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v7, 0x3ff00000, v8
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1100-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_cndmask_b32 v5, 0x3ff00000, v8
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1100-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x double> <double 0.500000e+00, double 0.500000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00> ; per-lane multiplier: splat 0.5 vs splat 1.0
+  %ldexp = fmul <2 x double> %x, %1 ; plain vector fmul; the checks above expect one high-dword select + v_mul_f64 per lane
+  ret <2 x double> %ldexp
+}

llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

arsenm

Almost there, but a few more cases might be useful.

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

arsenm · 2024-10-10T07:19:23Z

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01
+  %ldexp = fmul half %x, %y
+  ret half %ldexp


Add a few bfloat cases

Also vectors

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

arsenm · 2024-10-10T07:26:35Z

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {


Add a couple cases with inreg arguments for the scalar case

I did not understand what you mean. Can you give one very small example?

define float @s_fmul_select_f32_test1(float inreg %x, i32 inreg %bool.arg1, i32 inreg %bool.arg2) {

(Could also replace bool.arg2 with a constant everywhere)

Didn't add the scalar cases?

ping on scalar test

Can you give a small example? I didn't understand it the first time either.

; Scalar-case example: every argument is marked inreg; the result is returned as a float.
define float @v_s_fmul_select_f32_test1(float inreg %x, i32 inreg %bool.arg1) {
  %bool = icmp eq i32 %bool.arg1, 0                              ; condition compares against a constant instead of a second argument
  %y = select i1 %bool, float 2.000000e+00, float 1.000000e+00   ; multiplier: 2.0 when %bool is true, else 1.0
  %ldexp = fmul float %x, %y
  ret float %ldexp
}

; with scalar output
; Same pattern under the amdgpu_ps calling convention; the float result is bitcast to i32 before being returned.
define amdgpu_ps i32 @s_fmul_select_f32_test1(float inreg %x, i32 inreg %bool.arg1) {
  %bool = icmp eq i32 %bool.arg1, 0
  %y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
  %ldexp = fmul float %x, %y
  %cast = bitcast float %ldexp to i32
  ret i32 %cast
}

arsenm · 2024-10-10T07:30:06Z

Test really high powers of 2. Particularly the cases where the exponent is > 64 (or < -16) and thus no longer an inline immediate

vg0204 · 2024-10-10T11:21:39Z

Test really high powers of 2. Particularly the cases where the exponent is > 64 (or < -16) and thus no longer an inline immediate

; Attempted test for large power-of-two multipliers; as written it does not parse (see the llc error quoted below).
define float @fmul_select_f32_test11(float %x, i32 %bool.arg1, i32 %bool.arg2) {
  %bool = icmp eq i32 %bool.arg1, %bool.arg2
  ; NOTE: llc rejects these decimal float literals ("floating point constant invalid for type").
  %y = select i1 %bool, float -3.0223145e+23, float -7.2057594e+16
  %ldexp = fmul float %x, %y
  ret float %ldexp
}

Note : -3.0223145e+23 = -2^78 and -7.2057594e+16 = -2^56
What's wrong with this? It gives the following error:

/opt/compiler-explorer/clang-trunk/bin/llc: error: /opt/compiler-explorer/clang-trunk/bin/llc: <source>:72:31: error: floating point constant invalid for type
  %y = select i1 %bool, float -3.0223145e+23, float -7.2057594e+16
                              ^
Compiler returned: 1

arsenm · 2024-10-10T11:38:19Z

Test really high powers of 2. Particularly the cases where the exponent is > 64 (or < -16) and thus no longer an inline immediate
; Quoted from the previous comment: attempted test for large power-of-two multipliers that llc fails to parse.
define float @fmul_select_f32_test11(float %x, i32 %bool.arg1, i32 %bool.arg2) {
  %bool = icmp eq i32 %bool.arg1, %bool.arg2
  ; NOTE: llc rejects these decimal float literals ("floating point constant invalid for type").
  %y = select i1 %bool, float -3.0223145e+23, float -7.2057594e+16
  %ldexp = fmul float %x, %y
  ret float %ldexp
}
Note : -3.0223145e+23 = -2^78 and -7.2057594e+16 = -2^56 Whats wrong with this as it is giving this error?
/opt/compiler-explorer/clang-trunk/bin/llc: error: /opt/compiler-explorer/clang-trunk/bin/llc: <source>:72:31: error: floating point constant invalid for type
  %y = select i1 %bool, float -3.0223145e+23, float -7.2057594e+16

You can only use the decimal notation for cases that are exactly representable in decimal. You need to use the standard hex representation (I really wish the IR would switch to using C99 hex float notation)

arsenm · 2024-10-10T11:40:28Z

You can always write something simpler and let constant folding show you how it should be printed: https://godbolt.org/z/PsdMnq37T

This adds the f32/f64/f16/bf16 test cases for below pattern : fmul x, select(y, A, B) where A & B could be inline/non-inline values It acts as pre-commit tests for dagCombine above pattern into cheaper ldexp in f64 case.

arsenm · 2024-10-21T22:04:05Z

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {


ping on scalar test

Can you give a small example? I didn't understand it the first time either.

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

vg0204 · 2024-10-28T07:16:42Z

Kind Ping!

vg0204 · 2024-11-11T06:03:47Z

Ping @arsenm!

This adds the f32/f64/f16/bf16 test cases for the pattern below: `fmul x, select(y, A, B)` with just one use of the select instruction above. It acts as a pre-commit test for DAG-combining the above pattern into a cheaper ldexp when the select operands are non-inline 32-bit constants. (llvm#111109) Change-Id: Ia6a3bb41b25ca8fb3d3f5bc67c183c168d8f4ba8

llvmbot added the backend:AMDGPU label Oct 4, 2024

vg0204 self-assigned this Oct 4, 2024

vg0204 added the llvm:codegen label Oct 4, 2024

vg0204 requested a review from arsenm October 4, 2024 07:37

arsenm reviewed Oct 4, 2024

View reviewed changes

arsenm reviewed Oct 8, 2024

View reviewed changes

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll Show resolved Hide resolved

arsenm reviewed Oct 10, 2024

View reviewed changes

vg0204 mentioned this pull request Oct 10, 2024

[AMDGPU] InstCombine results in performance drop in ROCM's rocRAND library in MI100 #104900

Closed

vg0204 force-pushed the vg0204/pre-commit-test-ldexp-dag-combine branch from 4efec17 to d963aec Compare October 15, 2024 07:08

[CodeGen] [AMDGPU] Adds pre-commit test for fmul-select combine

650131a

This adds the f32/f64/f16/bf16 test cases for below pattern : fmul x, select(y, A, B) where A & B could be inline/non-inline values It acts as pre-commit tests for dagCombine above pattern into cheaper ldexp in f64 case.

vg0204 force-pushed the vg0204/pre-commit-test-ldexp-dag-combine branch from d963aec to 650131a Compare October 21, 2024 06:16

arsenm reviewed Oct 21, 2024

View reviewed changes

updated subtests name in accordance to usage of Hex floating values.

446b829

arsenm approved these changes Nov 22, 2024

View reviewed changes

vg0204 requested a review from arsenm November 25, 2024 08:42

arsenm changed the title ~~[CodeGen] [AMDGPU] Adds pre-commit test for fmul-select combine~~ [AMDGPU] Adds pre-commit test for fmul-select combine Nov 25, 2024

arsenm merged commit 0a140c4 into llvm:main Nov 25, 2024
6 of 8 checks passed

vg0204 deleted the vg0204/pre-commit-test-ldexp-dag-combine branch December 16, 2024 17:28

vg0204 restored the vg0204/pre-commit-test-ldexp-dag-combine branch December 16, 2024 17:28

[AMDGPU] Adds pre-commit test for fmul-select combine #111107

[AMDGPU] Adds pre-commit test for fmul-select combine #111107

Uh oh!

Conversation

vg0204 commented Oct 4, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 4, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

arsenm Oct 10, 2024

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 10, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

arsenm Oct 10, 2024

Choose a reason for hiding this comment

Uh oh!

vg0204 Oct 10, 2024

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 10, 2024

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 15, 2024

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 21, 2024 • edited by vg0204 Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 30, 2024

Choose a reason for hiding this comment

Uh oh!

arsenm commented Oct 10, 2024

Uh oh!

vg0204 commented Oct 10, 2024

Uh oh!

arsenm commented Oct 10, 2024

Uh oh!

arsenm commented Oct 10, 2024

Uh oh!

arsenm Oct 21, 2024 • edited by vg0204 Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

vg0204 commented Oct 28, 2024

Uh oh!

vg0204 commented Nov 11, 2024

Uh oh!

Uh oh!

Uh oh!

vg0204 commented Oct 4, 2024 •

edited

Loading

arsenm Oct 21, 2024 •

edited by vg0204

Loading

arsenm Oct 21, 2024 •

edited by vg0204

Loading