-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[AMDGPU] Adds pre-commit test for fmul-select combine #111107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Adds pre-commit test for fmul-select combine #111107
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Vikash Gupta (vg0204) ChangesThis adds the f32/f64 test cases for below pattern : fmul x, select(y, 2.0, 1.0) It acts as pre-commit tests for dagCombining above pattern into cheaper ldexp in f64 case. Full diff: https://github.com/llvm/llvm-project/pull/111107.diff 1 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
new file mode 100644
index 00000000000000..c20cf332422fef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
@@ -0,0 +1,342 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f32_test1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test1:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select i1 %bool, float 2.000000e+00, float 1.000000e+00
+ %ldexp = fmul float %x, %1
+ ret float %ldexp
+}
+
+define float @fmul_select_f32_test2(float %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f32_test2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test2:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test2:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select i1 %bool, float 0.500000e+00, float 1.000000e+00
+ %ldexp = fmul float %x, %1
+ ret float %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test1(<2 x float> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f32_test1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test1:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+ %ldexp = fmul <2 x float> %x, %1
+ ret <2 x float> %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test2(<2 x float> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f32_test2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test2:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test2:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select <2 x i1> %bool, <2 x float> <float 0.500000e+00, float 0.500000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+ %ldexp = fmul <2 x float> %x, %1
+ ret <2 x float> %ldexp
+}
+
+define double @fmul_select_f64_test1(double %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f64_test1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3ff00000
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 2.0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test1:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v3, 1, v2
+; GFX1030-NEXT: v_mov_b32_e32 v2, 0
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT: v_cndmask_b32_e64 v3, 0x3ff00000, 2.0, vcc_lo
+; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 1, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT: v_cndmask_b32_e64 v3, 0x3ff00000, 2.0, vcc_lo
+; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select i1 %bool, double 2.000000e+00, double 1.000000e+00
+ %ldexp = fmul double %x, %1
+ ret double %ldexp
+}
+
+define double @fmul_select_f64_test2(double %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f64_test2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3ff00000
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x3fe00000
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test2:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v3, 1, v2
+; GFX1030-NEXT: v_mov_b32_e32 v4, 0x3fe00000
+; GFX1030-NEXT: v_mov_b32_e32 v2, 0
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT: v_cndmask_b32_e32 v3, 0x3ff00000, v4, vcc_lo
+; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test2:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_dual_mov_b32 v4, 0x3fe00000 :: v_dual_and_b32 v3, 1, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v3, 0x3ff00000, v4
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select i1 %bool, double 0.500000e+00, double 1.000000e+00
+ %ldexp = fmul double %x, %1
+ ret double %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test1(<2 x double> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f64_test1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v4
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x3ff00000
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 2.0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[5:6]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, 2.0, vcc
+; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test1:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v6, 1, v4
+; GFX1030-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX1030-NEXT: v_mov_b32_e32 v4, 0
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1030-NEXT: v_mov_b32_e32 v6, v4
+; GFX1030-NEXT: v_cndmask_b32_e64 v7, 0x3ff00000, 2.0, vcc_lo
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0x3ff00000, 2.0, vcc_lo
+; GFX1030-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_and_b32_e32 v6, 1, v4
+; GFX1100-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v5, 1, v5
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1100-NEXT: v_mov_b32_e32 v6, v4
+; GFX1100-NEXT: v_cndmask_b32_e64 v7, 0x3ff00000, 2.0, vcc_lo
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0x3ff00000, 2.0, vcc_lo
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+ %ldexp = fmul <2 x double> %x, %1
+ ret <2 x double> %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test2(<2 x double> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f64_test2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v4
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x3ff00000
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[5:6]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test2:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v6, 1, v4
+; GFX1030-NEXT: v_mov_b32_e32 v8, 0x3fe00000
+; GFX1030-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX1030-NEXT: v_mov_b32_e32 v4, 0
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1030-NEXT: v_mov_b32_e32 v6, v4
+; GFX1030-NEXT: v_cndmask_b32_e32 v7, 0x3ff00000, v8, vcc_lo
+; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1030-NEXT: v_cndmask_b32_e32 v5, 0x3ff00000, v8, vcc_lo
+; GFX1030-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test2:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_and_b32_e32 v6, 1, v4
+; GFX1100-NEXT: v_dual_mov_b32 v8, 0x3fe00000 :: v_dual_and_b32 v5, 1, v5
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX1100-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v7, 0x3ff00000, v8
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX1100-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_cndmask_b32 v5, 0x3ff00000, v8
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[6:7]
+; GFX1100-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %1 = select <2 x i1> %bool, <2 x double> <double 0.500000e+00, double 0.500000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+ %ldexp = fmul <2 x double> %x, %1
+ ret <2 x double> %ldexp
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
almost there, but few more cases might be useful
%bool = icmp eq i32 %bool.arg1, %bool.arg2 | ||
%y = select i1 %bool, half -1.600000e+01, half -3.200000e+01 | ||
%ldexp = fmul half %x, %y | ||
ret half %ldexp |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a few bfloat cases
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also vectors
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s | ||
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s | ||
|
||
define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a couple cases with inreg arguments for the scalar case
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did not understand what you mean? Can you give one very small example!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
define float @s_fmul_select_f32_test1(float inreg %x, i32 inreg %bool.arg1, i32 inreg %bool.arg2) {
(Could also replace bool.arg2 with a constant everywhere)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Didn't add the scalar cases?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping on scalar test
Can you give small example, I didn't understood before itself!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
define float @v_s_fmul_select_f32_test1(float inreg %x, i32 inreg %bool.arg1) {
%bool = icmp eq i32 %bool.arg1, 0
%y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
%ldexp = fmul float %x, %y
ret float %ldexp
}
; with scalar output
define amdgpu_ps i32 @s_fmul_select_f32_test1(float inreg %x, i32 inreg %bool.arg1) {
%bool = icmp eq i32 %bool.arg1, 0
%y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
%ldexp = fmul float %x, %y
%cast = bitcast float %ldexp to i32
ret i32 %cast
}
Test really high powers of 2. Particularly the cases where the exponent is > 64 (or < -16) and thus no longer an inline immediate |
Note : -3.0223145e+23 = -2^78 and -7.2057594e+16 = -2^56
|
You can only use the decimal notation for cases that are exactly representable in decimal. You need to use the standard hex representation (I really wish the IR would switch to using C99 hex float notation) |
You can always write something simpler and let constant folding show you how it should be printed: https://godbolt.org/z/PsdMnq37T |
4efec17
to
d963aec
Compare
This adds the f32/f64/f16/bf16 test cases for below pattern : fmul x, select(y, A, B) where A & B could be inline/non-inline values It acts as pre-commit tests for dagCombine above pattern into cheaper ldexp in f64 case.
d963aec
to
650131a
Compare
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s | ||
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s | ||
|
||
define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping on scalar test
Can you give small example, I didn't understood before itself!
Kind Ping! |
Ping @arsenm! |
This adds the f32/f64/f16/bf16 test cases for below pattern : `fmul x, select(y, A, B)` with just one use of select Inst above. It acts as pre-commit tests for dagCombining above pattern into cheaper ldexp in case of non-inlline 32 bit-constants. (llvm#111109) Change-Id: Ia6a3bb41b25ca8fb3d3f5bc67c183c168d8f4ba8
This adds the f32/f64/f16/bf16 test cases for below pattern :
fmul x, select(y, A, B)
with just one use of select Inst above.
It acts as pre-commit tests for dagCombining above pattern into cheaper ldexp in case of non-inlline 32 bit-constants. (#111109)