diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index eba9bf64884ec..fa18a09b3831d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1615,6 +1615,9 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12 op, string opName, multiclass VOP2_Real_e32_gfx11_gfx12 op> : VOP2Only_Real, VOP2Only_Real; +multiclass VOP2_V_PK_FMAC_F16_gfx11_gfx12 op> : + VOP2Only_Real_e32, VOP2Only_Real_e32; + multiclass VOP3Only_Realtriple_gfx11_gfx12 op> : VOP3Only_Realtriple, VOP3Only_Realtriple; @@ -1661,7 +1664,8 @@ defm V_SUBREV_CO_CI_U32 : defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11_gfx12<0x02f, "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">; -defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx11_gfx12<0x03c>; + +defm V_PK_FMAC_F16 : VOP2_V_PK_FMAC_F16_gfx11_gfx12<0x03c>; defm V_ADD_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">; defm V_ADD_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">; @@ -1945,6 +1949,11 @@ multiclass VOP2e_Real_gfx10 op, string opName, string asmName> : VOP2be_Real_dpp_gfx10, VOP2be_Real_dpp8_gfx10; +multiclass VOP2_FMAC_Real op> : + VOP2_Real_e32_gfx10, + VOP2_Real_dpp_gfx10, + VOP2_Real_dpp8_gfx10; + multiclass VOP2_Real_gfx10 op> : VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10, VOP2_Real_sdwa_gfx10, VOP2_Real_dpp_gfx10, VOP2_Real_dpp8_gfx10; @@ -1988,9 +1997,7 @@ defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; -let IsSingle = 1 in { - defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; -} +defm V_PK_FMAC_F16 : VOP2_FMAC_Real<0x03c>; // VOP2 no carry-in, carry-out. 
defm V_ADD_NC_U32 : diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx10.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx10.mir new file mode 100644 index 0000000000000..7dc88a3768761 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx10.mir @@ -0,0 +1,17 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN + +# GCN-LABEL: name: v_pk_fmac_f16 +# GCN: %4:vgpr_32 = IMPLICIT_DEF +# GCN: %3:vgpr_32 = V_PK_FMAC_F16_dpp %4, 0, %1, 0, %1, 1, 15, 15, 1, implicit $mode, implicit $exec +name: v_pk_fmac_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + + %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index c48231f3851a7..44be207dd882a 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -166,6 +166,22 @@ body: | %6:vgpr_32 = V_FMAC_F32_e64 2, %4, 2, %1, 2, %2, 1, 2, implicit $mode, implicit $exec ... +# GCN-LABEL: name: v_pk_fmac_f16 +# GCN: %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec +# GCN: %3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec +name: v_pk_fmac_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + + %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec +... 
+ # when the DPP source isn't a src0 operand the operation should be commuted if possible # GCN-LABEL: name: dpp_commute_shrink # GCN: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s index bf8e18ec14512..cd71ac7edaef1 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s @@ -13185,3 +13185,9 @@ v_pk_fmac_f16 v5, -4.0, v2 v_pk_fmac_f16 v5, v1, v255 // GFX10: encoding: [0x01,0xff,0x0b,0x78] + +v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff] + +v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 +// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s index da1989e2ee237..047267af44b98 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s @@ -187,3 +187,6 @@ v_mov_b16 v0.l, ttmp0.h v_mov_b16 v0.l, a0.h // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s index d25411b5bfd29..95836f3c897f4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s @@ -224,3 +224,6 @@ v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/literalv216.s 
b/llvm/test/MC/AMDGPU/literalv216.s index c695bc3600c38..f5afaa6bd6181 100644 --- a/llvm/test/MC/AMDGPU/literalv216.s +++ b/llvm/test/MC/AMDGPU/literalv216.s @@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0 // FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid v_pk_fmac_f16 v5, 0x12345678, v2 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] +// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt index b759912204db8..33d89da3b3ae9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt @@ -1779,54 +1779,60 @@ # GFX10: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] 0x6a,0x04,0x0a,0x38 -# GFX10: v_pk_fmac_f16 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79] +# GFX10: v_pk_fmac_f16_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79] 0x01,0x05,0xfe,0x79 -# GFX10: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] 0xc1,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78] 0xf7,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78] 0x80,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] 0xf0,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] 0x7f,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, 
exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] 0x7e,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78] 0x7c,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] 0x01,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78] 0x67,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78] 0x77,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] 0x01,0x05,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78] 0x01,0xff,0x0b,0x78 -# GFX10: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] 0xff,0x05,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] 0x6b,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] 0x6a,0x04,0x0a,0x78 +# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff] +0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff + +# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03] +0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03
+ # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x53] # W64: v_sub_co_ci_u32_e32 v255, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0xfe,0x53] 0x01,0x05,0xfe,0x53 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt index a022c79fe97e6..97c81ed1a629a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt @@ -144,5 +144,5 @@ # Packed VOP2 #===----------------------------------------------------------------------===// -# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] +# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] 0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12