-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[AMDGPU] Add support for v_cvt_f32_fp8
on gfx1250
#147579
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) ChangesCo-authored-by: Mekhanoshin, Stanislav <[email protected]> Patch is 59.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147579.diff 26 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fb358297a5eed..a5ee8013adff6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -429,6 +429,7 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, "V16fV2iV4iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-conversion-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8_e5m3, "fiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-conversion-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-conversion-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-conversion-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 4d4afedae3658..421099d3876e3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -139,3 +139,41 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
{
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
+
+// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP0]], i32 0)
+// CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP2]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP3]], i32 1)
+// CHECK-NEXT: [[CONV1:%.*]] = fptosi float [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[CONV1]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP6]], i32 2)
+// CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[TMP7]] to i32
+// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[CONV2]], ptr addrspace(1) [[TMP8]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP9]], i32 3)
+// CHECK-NEXT: [[CONV3:%.*]] = fptosi float [[TMP10]] to i32
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[CONV3]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_f32_fp8_e5m3(global int* out, int a)
+{
+ *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 0);
+ *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 1);
+ *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2);
+ *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 3ba0d50e79031..7494c4f984353 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -27,3 +27,8 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}
+
+void test_cvt_f32_fp8_e5m3(global int* out, int a)
+{
+ *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f7a9b65854696..16885f331e9dd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3402,6 +3402,12 @@ def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
+// llvm.amdgcn.cvt.f32.fp8.e5m3 float vdst, int srcA, imm byte_sel [0..3]
+def int_amdgcn_cvt_f32_fp8_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8_e5m3">,
+ DefaultAttrsIntrinsic<[llvm_float_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1a1c32fba9d18..a66b9c7285796 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -819,6 +819,12 @@ def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts",
"Has fp8 and bf8 conversion instructions"
>;
+def FeatureFP8E5M3Insts : SubtargetFeature<"fp8e5m3-insts",
+ "HasFP8E5M3Insts",
+ "true",
+ "Has fp8 e5m3 format support"
+>;
+
def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
"HasCvtFP8Vop1Bug",
"true",
@@ -1937,6 +1943,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureAtomicBufferPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureFP8ConversionInsts,
+ FeatureFP8E5M3Insts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
@@ -2573,6 +2580,12 @@ def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">,
AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>;
+def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">,
+ AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>;
+
+def NotHasFP8E5M3Insts : Predicate<"!Subtarget->hasFP8E5M3Insts()">,
+ AssemblerPredicate<(all_of (not FeatureFP8E5M3Insts))>;
+
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 353fb23fa1520..1483d97d23fcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4600,6 +4600,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
case Intrinsic::amdgcn_cvt_f32_fp8:
+ case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
case Intrinsic::amdgcn_cvt_f32_bf8:
case Intrinsic::amdgcn_cvt_off_f32_i4:
case Intrinsic::amdgcn_cvt_pk_f32_fp8:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 628f1a69865ed..3af140461afdb 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -9566,13 +9566,13 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
}
}
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp) && !IsVOP3CvtSrDpp)
addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTyByteSel);
+ AMDGPUOperand::ImmTyClamp);
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTyClamp);
+ AMDGPUOperand::ImmTyByteSel);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 45721422edcf1..f38076ea7c468 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -165,6 +165,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
+ bool HasFP8E5M3Insts = false;
bool HasCvtFP8Vop1Bug = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicFMinFMaxF32GlobalInsts = false;
@@ -861,6 +862,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
+ bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
+
bool hasPkFmacF16Inst() const {
return HasPkFmacF16Inst;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 0dacd9df71305..5e88684c102ce 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -676,13 +676,14 @@ let HasClamp = 0, HasOMod = 0, HasExtDPP = 0, HasExtVOP3DPP = 0,
}
}
-class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
+class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT, bit _HasClamp = 0> :
+ VOPProfile<[DstVT, i32, untyped, untyped]> {
+ let HasClamp = _HasClamp;
let HasFP8SrcByteSel = 1;
let HasOpSel = 0;
let HasExtDPP = 1;
let HasExtVOP3DPP = 1;
let HasExtSDWA = 0;
- let HasClamp = 0;
let HasOMod = 0;
let HasModifiers = 0;
}
@@ -695,7 +696,12 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
- defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
+ // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
+ let SubtargetPredicate = isGFX12PlusNot12_50 in
+ defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
+ let SubtargetPredicate = isGFX125xOnly in
+ defm V_CVT_F32_FP8_gfx1250 : VOP1Inst<"v_cvt_f32_fp8_gfx1250", VOPProfile_Base_CVT_F_F8_ByteSel<f32, 1>>;
+
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
let True16Predicate = UseFakeTrue16Insts in {
@@ -714,9 +720,19 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe
(inst $src0, (as_i32timm $byte_sel)))
>;
-let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
- def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
- def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
+let OtherPredicates = [HasFP8ConversionInsts] in {
+ // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
+ let SubtargetPredicate = isGFX12PlusNot12_50 in
+ def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
+ let SubtargetPredicate = isGFX125xOnly in {
+ def : GCNPat<(int_amdgcn_cvt_f32_fp8 i32:$src0, timm:$byte_sel),
+ (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.NONE, (as_i32timm $byte_sel))>;
+ def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel),
+ (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>;
+ }
+ // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
+ let SubtargetPredicate = isGFX12Plus in
+ def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
}
class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
@@ -1038,7 +1054,9 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
-defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
+defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
+defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
+
defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f8edfec1100a2..3e01f8cd044e2 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -352,13 +352,13 @@ class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gf
let Inst{14} = !if(!and(p.HasOpSel, p.HasDst), src0_modifiers{3}, 0);
}
- class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
- bits<2> byte_sel;
+class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ bits<2> byte_sel;
- let Inst{11} = 0; // op_sel0
- let Inst{12} = 0; // op_sel1
- let Inst{14-13} = byte_sel; // op_sel2/3
- }
+ let Inst{11} = !if(!and(p.HasOpSel, p.HasSrc0Mods), src0_modifiers{2}, 0); // op_sel0
+ let Inst{12} = !if(!and(p.HasOpSel, p.HasSrc1Mods), src1_modifiers{2}, 0); // op_sel1
+ let Inst{14-13} = byte_sel; // op_sel2/3
+}
class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> :
VOP3e_t16_gfx11_gfx12<op, p>{
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index cae12f9a4ed3e..31123c5eb7ab7 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -445,6 +445,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["prng-inst"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["fp8-conversion-insts"] = true;
+ Features["fp8e5m3-insts"] = true;
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll
new file mode 100644
index 0000000000000..20a26b19d4216
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32, i32)
+
+define float @test_cvt_f32_fp8_e5m3_byte0(i32 %a) {
+; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_fp8_e64 v0, v0 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 %a, i32 0)
+ ret float %ret
+}
+
+define float @test_cvt_f32_fp8_e5m3_byte1(i32 %a) {
+; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 %a, i32 1)
+ ret float %ret
+}
+
+define float @test_cvt_f32_fp8_e5m3_byte2(i32 %a) {
+; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 %a, i32 2)
+ ret float %ret
+}
+
+define float @test_cvt_f32_fp8_e5m3_byte3(i32 %a) {
+; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:3 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 %a, i32 3)
+ ret float %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 16d32b73b9b0d..09b1ea7c55afe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -6,6 +6,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
@@ -38,6 +41,13 @@ define float @test_cvt_f32_bf8_byte0(i32 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_cvt_f32_bf8_byte0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_bf8_e32 v0, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
ret float %ret
}
@@ -58,6 +68,13 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_cvt_f32_bf8_byte1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
ret float %ret
}
@@ -78,6 +95,13 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_cvt_f32_bf8_byte2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
ret float %ret
}
@@ -98,6 +122,13 @@ define float @test_cv...
[truncated]
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/3/builds/18743 Here is the relevant piece of the build log for the reference
|
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
Co-authored-by: Mekhanoshin, Stanislav [email protected]