Skip to content

Commit 4b0885d

Browse files
committed
[CodeGen] [AMDGPU] Attempt DAGCombine for fmul with select to ldexp
For f32/f16 this combine brings no improvement, but for f64 this specific case of fmul with select is more costly to materialize than ldexp, so the following DAG combine is applied: fmul x, select(y, 2.0, 1.0) -> ldexp x, zext(i1 y); fmul x, select(y, 0.5, 1.0) -> ldexp x, sext(i1 y). This resolves issue #104900.
1 parent 7537142 commit 4b0885d

File tree

3 files changed

+335
-0
lines changed

3 files changed

+335
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -899,6 +899,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
899899
ISD::FADD,
900900
ISD::FSUB,
901901
ISD::FDIV,
902+
ISD::FMUL,
902903
ISD::FMINNUM,
903904
ISD::FMAXNUM,
904905
ISD::FMINNUM_IEEE,
@@ -14476,6 +14477,57 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
1447614477
return SDValue();
1447714478
}
1447814479

14480+
/// Target DAG combine for ISD::FMUL.
///
/// Rewrites a scalar f64 multiply whose right operand is a single-use select
/// between two specific constants into an FLDEXP whose exponent is the
/// extended select condition:
///
///   fmul x, (select y, 2.0, 1.0) -> ldexp(x, zext(i1 y))
///   fmul x, (select y, 0.5, 1.0) -> ldexp(x, sext(i1 y))
///
/// \param N   the FMUL node being combined.
/// \param DCI combiner state; only its SelectionDAG is used here.
/// \returns the replacement FLDEXP node, or an empty SDValue when the
///          pattern does not match.
SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0)
  // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0)
  //
  // The above mentioned ldexp folding works fine for
  // f16/f32, but as for f64 it creates an f64 select which
  // is costly to materialize as compared to f64 ldexp,
  // so here we undo the transform for f64 as follows :
  //
  // fmul x, (select y, 2.0, 1.0) -> ldexp(x, zext(i1 y))
  // fmul x, (select y, 0.5, 1.0) -> ldexp(x, sext(i1 y))
  // TODO : Need to handle vector of f64 type.
  if (VT != MVT::f64)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Only rewrite when the select feeds nothing but this multiply.
  if (!RHS.hasOneUse() || RHS.getOpcode() != ISD::SELECT)
    return SDValue();

  const auto *TrueNode = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
  const auto *FalseNode = dyn_cast<ConstantFPSDNode>(RHS.getOperand(2));

  // The "false" arm must be the multiplicative identity so that a false
  // condition maps onto an exponent of zero.
  if (!TrueNode || !FalseNode || !FalseNode->isExactlyValue(1.0))
    return SDValue();

  // An i1 true zero-extends to 1 (x * 2^1) and sign-extends to -1 (x * 2^-1).
  unsigned ExtOp;
  if (TrueNode->isExactlyValue(2.0))
    ExtOp = ISD::ZERO_EXTEND;
  else if (TrueNode->isExactlyValue(0.5))
    ExtOp = ISD::SIGN_EXTEND;
  else
    return SDValue();

  SDLoc SL(N);
  SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));

  // Carry the original multiply's node flags over to the replacement so
  // fast-math information is not lost by the rewrite.
  return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ExtNode, N->getFlags());
}
14530+
1447914531
SDValue SITargetLowering::performFMACombine(SDNode *N,
1448014532
DAGCombinerInfo &DCI) const {
1448114533
SelectionDAG &DAG = DCI.DAG;
@@ -14765,6 +14817,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1476514817
return performFSubCombine(N, DCI);
1476614818
case ISD::FDIV:
1476714819
return performFDivCombine(N, DCI);
14820+
case ISD::FMUL:
14821+
return performFMulCombine(N, DCI);
1476814822
case ISD::SETCC:
1476914823
return performSetCCCombine(N, DCI);
1477014824
case ISD::FMAXNUM:

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
218218
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
219219
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
220220
SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const;
221+
SDValue performFMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
221222
SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
222223
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
223224
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
3+
;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs | FileCheck -check-prefix=GFX1030 %s
4+
;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
5+
6+
define float @fmul_select_f32_test1(float %x, i1 %bool) {
7+
; GFX9-LABEL: fmul_select_f32_test1:
8+
; GFX9: ; %bb.0:
9+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
11+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
12+
; GFX9-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
13+
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
14+
; GFX9-NEXT: s_setpc_b64 s[30:31]
15+
;
16+
; GFX1030-LABEL: fmul_select_f32_test1:
17+
; GFX1030: ; %bb.0:
18+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19+
; GFX1030-NEXT: v_and_b32_e32 v1, 1, v1
20+
; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
21+
; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
22+
; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
23+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
24+
;
25+
; GFX1100-LABEL: fmul_select_f32_test1:
26+
; GFX1100: ; %bb.0:
27+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28+
; GFX1100-NEXT: v_and_b32_e32 v1, 1, v1
29+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
30+
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
31+
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
32+
; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
33+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
34+
%1 = select i1 %bool, float 2.000000e+00, float 1.000000e+00
35+
%ldexp = fmul float %x, %1
36+
ret float %ldexp
37+
}
38+
39+
define float @fmul_select_f32_test2(float %x, i1 %bool) {
40+
; GFX9-LABEL: fmul_select_f32_test2:
41+
; GFX9: ; %bb.0:
42+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
44+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
45+
; GFX9-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
46+
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
47+
; GFX9-NEXT: s_setpc_b64 s[30:31]
48+
;
49+
; GFX1030-LABEL: fmul_select_f32_test2:
50+
; GFX1030: ; %bb.0:
51+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52+
; GFX1030-NEXT: v_and_b32_e32 v1, 1, v1
53+
; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
54+
; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
55+
; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
56+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
57+
;
58+
; GFX1100-LABEL: fmul_select_f32_test2:
59+
; GFX1100: ; %bb.0:
60+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61+
; GFX1100-NEXT: v_and_b32_e32 v1, 1, v1
62+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
63+
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
64+
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
65+
; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
66+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
67+
%1 = select i1 %bool, float 0.500000e+00, float 1.000000e+00
68+
%ldexp = fmul float %x, %1
69+
ret float %ldexp
70+
}
71+
72+
define <2 x float> @fmul_select_v2f32_test1(<2 x float> %x, <2 x i1> %bool) {
73+
; GFX9-LABEL: fmul_select_v2f32_test1:
74+
; GFX9: ; %bb.0:
75+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76+
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
77+
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
78+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
79+
; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
80+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
81+
; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
82+
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
83+
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
84+
; GFX9-NEXT: s_setpc_b64 s[30:31]
85+
;
86+
; GFX1030-LABEL: fmul_select_v2f32_test1:
87+
; GFX1030: ; %bb.0:
88+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89+
; GFX1030-NEXT: v_and_b32_e32 v2, 1, v2
90+
; GFX1030-NEXT: v_and_b32_e32 v3, 1, v3
91+
; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
92+
; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
93+
; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
94+
; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
95+
; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
96+
; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3
97+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
98+
;
99+
; GFX1100-LABEL: fmul_select_v2f32_test1:
100+
; GFX1100: ; %bb.0:
101+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102+
; GFX1100-NEXT: v_and_b32_e32 v2, 1, v2
103+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
104+
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
105+
; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
106+
; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
107+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
108+
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
109+
; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
110+
; GFX1100-NEXT: v_mul_f32_e32 v1, v1, v3
111+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
112+
%1 = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
113+
%ldexp = fmul <2 x float> %x, %1
114+
ret <2 x float> %ldexp
115+
}
116+
117+
define <2 x float> @fmul_select_v2f32_test2(<2 x float> %x, <2 x i1> %bool) {
118+
; GFX9-LABEL: fmul_select_v2f32_test2:
119+
; GFX9: ; %bb.0:
120+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121+
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
122+
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
123+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
124+
; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
125+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
126+
; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
127+
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
128+
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
129+
; GFX9-NEXT: s_setpc_b64 s[30:31]
130+
;
131+
; GFX1030-LABEL: fmul_select_v2f32_test2:
132+
; GFX1030: ; %bb.0:
133+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134+
; GFX1030-NEXT: v_and_b32_e32 v2, 1, v2
135+
; GFX1030-NEXT: v_and_b32_e32 v3, 1, v3
136+
; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
137+
; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
138+
; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
139+
; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
140+
; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
141+
; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3
142+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
143+
;
144+
; GFX1100-LABEL: fmul_select_v2f32_test2:
145+
; GFX1100: ; %bb.0:
146+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147+
; GFX1100-NEXT: v_and_b32_e32 v2, 1, v2
148+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
149+
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
150+
; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
151+
; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
152+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
153+
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
154+
; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
155+
; GFX1100-NEXT: v_mul_f32_e32 v1, v1, v3
156+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
157+
%1 = select <2 x i1> %bool, <2 x float> <float 0.500000e+00, float 0.500000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
158+
%ldexp = fmul <2 x float> %x, %1
159+
ret <2 x float> %ldexp
160+
}
161+
162+
define double @fmul_select_f64_test1(double %x, i1 %bool) {
163+
; GFX9-LABEL: fmul_select_f64_test1:
164+
; GFX9: ; %bb.0:
165+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166+
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
167+
; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
168+
; GFX9-NEXT: s_setpc_b64 s[30:31]
169+
;
170+
; GFX1030-LABEL: fmul_select_f64_test1:
171+
; GFX1030: ; %bb.0:
172+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173+
; GFX1030-NEXT: v_and_b32_e32 v2, 1, v2
174+
; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
175+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
176+
;
177+
; GFX1100-LABEL: fmul_select_f64_test1:
178+
; GFX1100: ; %bb.0:
179+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180+
; GFX1100-NEXT: v_and_b32_e32 v2, 1, v2
181+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
182+
; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
183+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
184+
%1 = select i1 %bool, double 2.000000e+00, double 1.000000e+00
185+
%ldexp = fmul double %x, %1
186+
ret double %ldexp
187+
}
188+
189+
define double @fmul_select_f64_test2(double %x, i1 %bool) {
190+
; GFX9-LABEL: fmul_select_f64_test2:
191+
; GFX9: ; %bb.0:
192+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193+
; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 1
194+
; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
195+
; GFX9-NEXT: s_setpc_b64 s[30:31]
196+
;
197+
; GFX1030-LABEL: fmul_select_f64_test2:
198+
; GFX1030: ; %bb.0:
199+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200+
; GFX1030-NEXT: v_bfe_i32 v2, v2, 0, 1
201+
; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
202+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
203+
;
204+
; GFX1100-LABEL: fmul_select_f64_test2:
205+
; GFX1100: ; %bb.0:
206+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207+
; GFX1100-NEXT: v_bfe_i32 v2, v2, 0, 1
208+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
209+
; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
210+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
211+
%1 = select i1 %bool, double 0.500000e+00, double 1.000000e+00
212+
%ldexp = fmul double %x, %1
213+
ret double %ldexp
214+
}
215+
216+
define <2 x double> @fmul_select_v2f64_test1(<2 x double> %x, <2 x i1> %bool) {
217+
; GFX9-LABEL: fmul_select_v2f64_test1:
218+
; GFX9: ; %bb.0:
219+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220+
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
221+
; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
222+
; GFX9-NEXT: v_and_b32_e32 v4, 1, v5
223+
; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
224+
; GFX9-NEXT: s_setpc_b64 s[30:31]
225+
;
226+
; GFX1030-LABEL: fmul_select_v2f64_test1:
227+
; GFX1030: ; %bb.0:
228+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229+
; GFX1030-NEXT: v_and_b32_e32 v4, 1, v4
230+
; GFX1030-NEXT: v_and_b32_e32 v5, 1, v5
231+
; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
232+
; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
233+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
234+
;
235+
; GFX1100-LABEL: fmul_select_v2f64_test1:
236+
; GFX1100: ; %bb.0:
237+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238+
; GFX1100-NEXT: v_and_b32_e32 v4, 1, v4
239+
; GFX1100-NEXT: v_and_b32_e32 v5, 1, v5
240+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
241+
; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
242+
; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
243+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
244+
%1 = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
245+
%ldexp = fmul <2 x double> %x, %1
246+
ret <2 x double> %ldexp
247+
}
248+
249+
define <2 x double> @fmul_select_v2f64_test2(<2 x double> %x, <2 x i1> %bool) {
250+
; GFX9-LABEL: fmul_select_v2f64_test2:
251+
; GFX9: ; %bb.0:
252+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253+
; GFX9-NEXT: v_bfe_i32 v4, v4, 0, 1
254+
; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
255+
; GFX9-NEXT: v_bfe_i32 v4, v5, 0, 1
256+
; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
257+
; GFX9-NEXT: s_setpc_b64 s[30:31]
258+
;
259+
; GFX1030-LABEL: fmul_select_v2f64_test2:
260+
; GFX1030: ; %bb.0:
261+
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262+
; GFX1030-NEXT: v_bfe_i32 v4, v4, 0, 1
263+
; GFX1030-NEXT: v_bfe_i32 v5, v5, 0, 1
264+
; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
265+
; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
266+
; GFX1030-NEXT: s_setpc_b64 s[30:31]
267+
;
268+
; GFX1100-LABEL: fmul_select_v2f64_test2:
269+
; GFX1100: ; %bb.0:
270+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271+
; GFX1100-NEXT: v_bfe_i32 v4, v4, 0, 1
272+
; GFX1100-NEXT: v_bfe_i32 v5, v5, 0, 1
273+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
274+
; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
275+
; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
276+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
277+
%1 = select <2 x i1> %bool, <2 x double> <double 0.500000e+00, double 0.500000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
278+
%ldexp = fmul <2 x double> %x, %1
279+
ret <2 x double> %ldexp
280+
}

0 commit comments

Comments
 (0)