From f154d327dc86a99f7bcb9c2ae6f4cce817d250d5 Mon Sep 17 00:00:00 2001 From: kbobrovs Date: Thu, 9 Sep 2021 12:38:15 -0700 Subject: [PATCH 1/3] [SYCL] Add support for __regcall calling convention to spir targets. This calling convention makes compiler return values and function arguments passed as values (through virtual registers) in most cases. The implementation is basically customizing SPIRABIInfo for CGFunctionInfo with the X86_RegCall calling convention. Code generation is mostly borrowed from AMDGPUABIInfo, but with some of the restrictions for passing by value removed. Signed-off-by: kbobrovs --- clang/lib/Basic/Targets/SPIR.h | 14 +- clang/lib/CodeGen/TargetInfo.cpp | 108 +++++- clang/test/CodeGenSYCL/regcall-cc-test.cpp | 360 ++++++++++++++++++ .../sycl_ext_intel_esimd.md | 110 +++++- 4 files changed, 585 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGenSYCL/regcall-cc-test.cpp diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1f25d75932372..bd483e939485c 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -128,8 +128,14 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo { } CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { - return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK - : CCCR_Warning; + return (CC == CC_SpirFunction || CC == CC_OpenCLKernel || + // Permit CC_X86RegCall which is used to mark external functions + // with + // explicit simd or structure type arguments to pass them via + // registers. + CC == CC_X86RegCall) + ? CCCR_OK + : CCCR_Warning; } CallingConv getDefaultCallingConv() const override { @@ -286,8 +292,10 @@ class LLVM_LIBRARY_VISIBILITY WindowsX86_64_SPIR64TargetInfo } CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { - if (CC == CC_X86VectorCall) + if (CC == CC_X86VectorCall || CC == CC_X86RegCall) // Permit CC_X86VectorCall which is used in Microsoft headers + // Permit CC_X86RegCall which is used to mark external functions with + // explicit simd or structure type arguments to pass them via registers. return CCCR_OK; return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK : CCCR_Warning; diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index a0d38dcb798b8..3fc405e316b0e 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10286,6 +10286,11 @@ class CommonSPIRABIInfo : public DefaultABIInfo { ABIArgInfo classifyKernelArgumentType(QualType Ty) const; + // Add new functions rather then overload existing so that these public APIs + // can't be blindly misused with wrong calling convention. + ABIArgInfo classifyRegcallReturnType(QualType RetTy) const; + ABIArgInfo classifyRegcallArgumentType(QualType RetTy) const; + void computeInfo(CGFunctionInfo &FI) const override; private: @@ -10305,17 +10310,114 @@ ABIArgInfo CommonSPIRABIInfo::classifyKernelArgumentType(QualType Ty) const { void CommonSPIRABIInfo::computeInfo(CGFunctionInfo &FI) const { llvm::CallingConv::ID CC = FI.getCallingConvention(); + bool IsRegCall = CC == llvm::CallingConv::X86_RegCall; - if (!getCXXABI().classifyReturnType(FI)) - FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); + if (!getCXXABI().classifyReturnType(FI)) { + CanQualType RetT = FI.getReturnType(); + FI.getReturnInfo() = + IsRegCall ? classifyRegcallReturnType(RetT) : classifyReturnType(RetT); + } for (auto &Arg : FI.arguments()) { if (CC == llvm::CallingConv::SPIR_KERNEL) { Arg.info = classifyKernelArgumentType(Arg.type); } else { - Arg.info = classifyArgumentType(Arg.type); + Arg.info = IsRegCall ? classifyRegcallArgumentType(Arg.type) + : classifyArgumentType(Arg.type); + } + } +} + +// The two functions below are based on AMDGPUABIInfo, but without any +// restriction on the maximum number of arguments passed via registers. +// SPIRV BEs are expected to further adjust the calling convention as +// needed (use stack or byval-like passing) for some of the arguments. + +ABIArgInfo CommonSPIRABIInfo::classifyRegcallReturnType(QualType RetTy) const { + if (isAggregateTypeForABI(RetTy)) { + // Records with non-trivial destructors/copy-constructors should not be + // returned by value. + if (!getRecordArgABI(RetTy, getCXXABI())) { + // Ignore empty structs/unions. + if (isEmptyRecord(getContext(), RetTy, true)) + return ABIArgInfo::getIgnore(); + + // Lower single-element structs to just return a regular value. + if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext())) + return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + + if (const RecordType *RT = RetTy->getAs()) { + const RecordDecl *RD = RT->getDecl(); + if (RD->hasFlexibleArrayMember()) + return classifyReturnType(RetTy); + } + + // Pack aggregates <= 8 bytes into a single vector register or pair. + // TODO make this parameterizeable/adjustable depending on spir target + // triple abi component. + uint64_t Size = getContext().getTypeSize(RetTy); + if (Size <= 16) + return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); + + if (Size <= 32) + return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); + + if (Size <= 64) { + llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext()); + return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); + } + return ABIArgInfo::getDirect(); } } + // Otherwise just do the default thing. + return classifyReturnType(RetTy); +} + +ABIArgInfo CommonSPIRABIInfo::classifyRegcallArgumentType(QualType Ty) const { + Ty = useFirstFieldIfTransparentUnion(Ty); + + if (isAggregateTypeForABI(Ty)) { + // Records with non-trivial destructors/copy-constructors should not be + // passed by value. + if (auto RAA = getRecordArgABI(Ty, getCXXABI())) + return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); + + // Ignore empty structs/unions. + if (isEmptyRecord(getContext(), Ty, true)) + return ABIArgInfo::getIgnore(); + + // Lower single-element structs to just pass a regular value. TODO: We + // could do reasonable-size multiple-element structs too, using getExpand(), + // though watch out for things like bitfields. + if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) + return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + + if (const RecordType *RT = Ty->getAs()) { + const RecordDecl *RD = RT->getDecl(); + if (RD->hasFlexibleArrayMember()) + return classifyArgumentType(Ty); + } + + // Pack aggregates <= 8 bytes into single vector register or pair. + // TODO make this parameterizeable/adjustable depending on spir target + // triple abi component. + uint64_t Size = getContext().getTypeSize(Ty); + if (Size <= 64) { + if (Size <= 16) + return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); + + if (Size <= 32) + return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); + + // XXX: Should this be i64 instead, and should the limit increase? + llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext()); + return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); + } + return ABIArgInfo::getDirect(); + } + + // Otherwise just do the default thing. + return classifyArgumentType(Ty); } class SPIRVABIInfo : public CommonSPIRABIInfo { diff --git a/clang/test/CodeGenSYCL/regcall-cc-test.cpp b/clang/test/CodeGenSYCL/regcall-cc-test.cpp new file mode 100644 index 0000000000000..371c702f10e76 --- /dev/null +++ b/clang/test/CodeGenSYCL/regcall-cc-test.cpp @@ -0,0 +1,360 @@ +// clang-format off +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -disable-llvm-passes -O0 -emit-llvm %s -o - | FileCheck %s + +// This test checks SYCL device compiler code generation for the __regcall +// functions. This calling convention makes return values and function arguments +// passed as values (through virtual registers) in most cases. + +// CHECK-DAG: target triple = "spir64-unknown-unknown" + +// ------------------- Positive test cases (pass by value) + +template using raw_vector = + T __attribute__((ext_vector_type(N))); + +template +struct simd { + raw_vector val; +}; + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE __attribute__((sycl_device)) +#else +#define SYCL_DEVICE +#endif + +template T __regcall func(T x) { return x.foo(); } + +// === TEST CASE: invoke_simd scenario, when sycl::ext::intel::esimd::simd +// objects used as return value and parameters + +SYCL_DEVICE simd __regcall SCALE(simd v); +// CHECK-DAG: declare x86_regcallcc <8 x float> @_Z17__regcall3__SCALE4simdIfLi8EE(<8 x float>) + +SYCL_DEVICE simd __regcall foo(simd x) { + return SCALE(x); +// CHECK-DAG: %{{[0-9a-zA-Z_.]+}} = call x86_regcallcc <8 x float> @_Z17__regcall3__SCALE4simdIfLi8EE(<8 x float> %{{[0-9a-zA-Z_.]+}}) +} + +// === TEST CASE: nested struct with different types of fields + +struct C { + float x, y; +}; +// CHECK-DAG: %struct.C = type { float, float } + +struct PassAsByval { + C a; + int *b; + raw_vector c; +}; +// CHECK-DAG: %struct.PassAsByval = type { %struct.C, i32 addrspace(4)*, <3 x float> } + +SYCL_DEVICE PassAsByval __regcall bar(PassAsByval x) { +// CHECK-DAG: define dso_local x86_regcallcc %struct.PassAsByval @_Z15__regcall3__bar11PassAsByval(%struct.C %{{[0-9a-zA-Z_.]+}}, i32 addrspace(4)* %{{[0-9a-zA-Z_.]+}}, <3 x float> %{{[0-9a-zA-Z_.]+}}) + x.a.x += 1; + return x; +} + +// === TEST CASE: multi-level nested structs with single primitive type element at the bottom + +struct A1 { char x; }; +struct B1 { A1 a; }; +struct C1 { + B1 b; + C1 foo() { return *this; } +}; +// CHECK-DAG: %struct.C1 = type { %struct.B1 } +// CHECK-DAG: %struct.B1 = type { %struct.A1 } +// CHECK-DAG: %struct.A1 = type { i8 } + +template SYCL_DEVICE C1 __regcall func(C1 x); +// CHECK-DAG: define weak_odr x86_regcallcc i8 @_Z16__regcall3__funcI2C1ET_S1_(i8 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with multiple elements at all levels + +struct A2 { char x; }; +struct B2 { A2 a; int* ptr; }; +struct C2 { + B2 b; + double c; + + C2 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C2 = type { %struct.B2, double } +// CHECK-DAG: %struct.B2 = type { %struct.A2, i32 addrspace(4)* } +// CHECK-DAG: %struct.A2 = type { i8 } + +template SYCL_DEVICE C2 __regcall func(C2 x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.C2 @_Z16__regcall3__funcI2C2ET_S1_(%struct.B2 %{{[0-9a-zA-Z_.]+}}, double %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with one primitive type element at +// the bottom, and one - at the top. The nested struct at the top is expected to +// get "unwraped" by the compiler evaporating to the single element at the +// bottom. + +struct A3 { char x; }; +struct B3 { A3 a; }; // unwrapped +struct C3 { // unwrapped + B3 b; + char c; + + C3 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C3 = type { %struct.B3, i8 } +// CHECK-DAG: %struct.B3 = type { %struct.A3 } +// CHECK-DAG: %struct.A3 = type { i8 } + +template SYCL_DEVICE C3 __regcall func(C3 x); +// CHECK-DAG: define weak_odr x86_regcallcc i16 @_Z16__regcall3__funcI2C3ET_S1_(i16 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with one primitive type element at +// the bottom, and one - at the top. The nested struct at the top is expected to +// get "unwraped" by the compiler evaporating to the single element at the +// bottom. + +struct A4 { char x; }; +struct B4 { A4 a; }; +struct C4 { + B4 b; + int *ptr; + + C4 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C4 = type { %struct.B4, i32 addrspace(4)* } +// CHECK-DAG: %struct.B4 = type { %struct.A4 } +// CHECK-DAG: %struct.A4 = type { i8 } + +template SYCL_DEVICE C4 __regcall func(C4 x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.C4 @_Z16__regcall3__funcI2C4ET_S1_(%struct.B4 %{{[0-9a-zA-Z_.]+}}, i32 addrspace(4)* %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with only leaf fields of primitive +// types. Unwrapping and merging should yield 2 32-bit integers + +struct A5a { char x; char y; }; +struct A5b { char x; char y; }; +struct B5 { A5a a; A5b b; }; +struct C5 { + B5 b1; + B5 b2; + + C5 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C5 = type { %struct.B5, %struct.B5 } +// CHECK-DAG: %struct.B5 = type { %struct.A5a, %struct.A5b } +// CHECK-DAG: %struct.A5a = type { i8, i8 } +// CHECK-DAG: %struct.A5b = type { i8, i8 } + +template SYCL_DEVICE C5 __regcall func(C5 x); +// CHECK-DAG: define weak_odr x86_regcallcc [2 x i32] @_Z16__regcall3__funcI2C5ET_S1_([2 x i32] %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with only leaf fields of primitive +// types. Unwrapping and merging should yield 2 32-bit integers + +struct B6 { int *a; int b; }; +struct C6 { + B6 b; + char x; + char y; + + C6 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C6 = type { %struct.B6, i8, i8 } +// CHECK-DAG: %struct.B6 = type { i32 addrspace(4)*, i32 } + +template SYCL_DEVICE C6 __regcall func(C6 x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.C6 @_Z16__regcall3__funcI2C6ET_S1_(%struct.B6 %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with sizeof() <= 2 is passed as a single 16-bit integer + +struct CharChar { + char a; + char b; + + CharChar foo() { return *this; } +}; +// CHECK-DAG: %struct.CharChar = type { i8, i8 } + +template SYCL_DEVICE CharChar __regcall func(CharChar x); +// CHECK-DAG: define weak_odr x86_regcallcc i16 @_Z16__regcall3__funcI8CharCharET_S1_(i16 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with sizeof() == 3-4 is passed as single 32-bit integer + +struct ShortShort { + short a; + short b; + + ShortShort foo() { return *this; } +}; +// CHECK-DAG: %struct.ShortShort = type { i16, i16 } + +template SYCL_DEVICE ShortShort __regcall func(ShortShort x); +// CHECK-DAG: define weak_odr x86_regcallcc i32 @_Z16__regcall3__funcI10ShortShortET_S1_(i32 %{{[0-9a-zA-Z_.]+}}) + +struct CharShort { + char a; + short b; + + CharShort foo() { return *this; } +}; +// CHECK-DAG: %struct.CharShort = type { i8, i16 } + +template SYCL_DEVICE CharShort __regcall func(CharShort x); +// CHECK-DAG: define weak_odr x86_regcallcc i32 @_Z16__regcall3__funcI9CharShortET_S1_(i32 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with primitive single field element is just unwrapped + +struct Char { + char a; + + Char foo() { return *this; } +}; +// CHECK-DAG: %struct.Char = type { i8 } + +template SYCL_DEVICE Char __regcall func(Char x); +// CHECK-DAG: define weak_odr x86_regcallcc i8 @_Z16__regcall3__funcI4CharET_S1_(i8 %{{[0-9a-zA-Z_.]+}}) + +struct Float { + float a; + + Float foo() { return *this; } +}; +// CHECK-DAG: %struct.Float = type { float } + +template SYCL_DEVICE Float __regcall func(Float x); +// CHECK-DAG: define weak_odr x86_regcallcc float @_Z16__regcall3__funcI5FloatET_S1_(float %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with sizeof() == 5-8 is passed as two 32-bit integers +// 32-bit integers + +struct CharCharShortFloat { + char a, b; + short c; + float d; + + CharCharShortFloat foo() { return *this; } +}; +// CHECK-DAG: %struct.CharCharShortFloat = type { i8, i8, i16, float } + +template SYCL_DEVICE CharCharShortFloat __regcall func(CharCharShortFloat x); +// CHECK-DAG: define weak_odr x86_regcallcc [2 x i32] @_Z16__regcall3__funcI18CharCharShortFloatET_S1_([2 x i32] %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with some of the fields padded and sizeof() > 8 +// * when passed as argument, it is broken into constituents +// * is returned by value + +struct CharFloatCharShort { + char a; + float b; + char c; + short d; + + CharFloatCharShort foo() { return *this; } +}; + +// CHECK-DAG: %struct.CharFloatCharShort = type { i8, float, i8, i16 } + +template SYCL_DEVICE CharFloatCharShort __regcall func(CharFloatCharShort x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.CharFloatCharShort @_Z16__regcall3__funcI18CharFloatCharShortET_S1_(i8 %{{[0-9a-zA-Z_.]+}}, float %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}}) + +struct CharDoubleCharLonglong { + char a; + double b; + char c; + long long d; + + CharDoubleCharLonglong foo() { return *this; } +}; + +// CHECK-DAG: %struct.CharDoubleCharLonglong = type { i8, double, i8, i64 } + +template SYCL_DEVICE CharDoubleCharLonglong __regcall func(CharDoubleCharLonglong x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.CharDoubleCharLonglong @_Z16__regcall3__funcI22CharDoubleCharLonglongET_S1_(i8 %{{[0-9a-zA-Z_.]+}}, double %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}) + + +// === TEST CASE: a struct of 130x4-byte elements is still passed by value + +struct StillPassThroughRegisters { + // 130 total: + int a, a01, a02, a03, a04, a05, a06, a07, a08, a09, + a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, + a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, + a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, + a40, a41, a42, a43, a44, a45, a46, a47, a48, a49, + a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, + a60, a61, a62, a63, a64, a65, a66, a67, a68, a69, + a70, a71, a72, a73, a74, a75, a76, a77, a78, a79, + a80, a81, a82, a83, a84, a85, a86, a87, a88, a89, + a90, a91, a92, a93, a94, a95, a96, a97, a98, a99, + aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7, aa8, aa9, + ab0, ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8, ab9, + ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7, ac8, ac9; + + StillPassThroughRegisters foo() { return *this; } +}; +// CHECK-DAG: %struct.StillPassThroughRegisters = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } + + +template SYCL_DEVICE StillPassThroughRegisters __regcall func(StillPassThroughRegisters x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.StillPassThroughRegisters @_Z16__regcall3__funcI25StillPassThroughRegistersET_S1_(i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: class with multiple inheritance is passed by value + +class Class0 { int x0; }; +class Class1 { int x1; }; +class ClassX : public Class0, public Class1 { int x; }; +class ClassY { int y; }; +class ClassXY : public ClassX, public ClassY { + int xy; +public: + ClassXY foo() { return *this; } +}; +// CHECK-DAG: %class.ClassXY = type { %class.ClassX, %class.ClassY, i32 } +// CHECK-DAG: %class.ClassX = type { %class.Class0, %class.Class1, i32 } +// CHECK-DAG: %class.Class0 = type { i32 } +// CHECK-DAG: %class.Class1 = type { i32 } +// CHECK-DAG: %class.ClassY = type { i32 } + +template SYCL_DEVICE ClassXY __regcall func(ClassXY x); +// CHECK-DAG: define weak_odr x86_regcallcc %class.ClassXY @_Z16__regcall3__funcI7ClassXYET_S1_(%class.ClassX %{{[0-9a-zA-Z_.]+}}, %class.ClassY %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) + +// ------------------- Negative test cases (pass via memory) + +// === TEST CASE: no copy constructor -> pass by pointer +struct NonCopyable { + NonCopyable(int a) : a(a) {} + NonCopyable(const NonCopyable&) = delete; + int a; +}; +// CHECK-DAG: %struct.NonCopyable = type { i32 } + +SYCL_DEVICE int __regcall bar(NonCopyable x) { +// CHECK-DAG: define dso_local x86_regcallcc noundef i32 @_Z15__regcall3__bar11NonCopyable(%struct.NonCopyable* noundef %x) + return x.a; +} + +// === TEST CASE: empty struct -> optimize out +struct Empty {}; +// CHECK-DAG: %struct.Empty = type + +SYCL_DEVICE int __regcall bar(Empty x) { +// CHECK-DAG: define dso_local x86_regcallcc noundef i32 @_Z15__regcall3__bar5Empty() + return 10; +} + +// === TEST CASE: struct ends with flexible array -> pass by pointer +struct EndsWithFlexArray { + int a; + int x[]; +}; +// CHECK-DAG: %struct.EndsWithFlexArray = type { i32, [0 x i32] } + +SYCL_DEVICE int __regcall bar(EndsWithFlexArray x) { +// CHECK-DAG: define dso_local x86_regcallcc noundef i32 @_Z15__regcall3__bar17EndsWithFlexArray(%struct.EndsWithFlexArray* noundef byval(%struct.EndsWithFlexArray) align 4 %x) + return x.a; +} diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index 3d42b34d2596e..c7ed61a718a00 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -106,7 +106,7 @@ device-side API - `sycl::accessor::get_pointer()`. All memory accesses through an accessor are done via explicit APIs; e.g. `sycl::ext::intel::experimental::esimd::block_store(acc, offset)` - Accessors with offsets and/or access range specified -- `sycl::sampler` and `sycl::stream` classes +- `sycl::sampler` and `sycl::stream` classes ## Core Explicit SIMD programming APIs @@ -511,6 +511,114 @@ ESIMD_PRIVATE ESIMD_REGISTER(32) simd vc; ```
+### `__regcall` Calling convention. + +ESIMD supports `__regcall` calling convention (CC) in addition to the default +SPIR CC. This makes compiler try generate more efficient calls where arguments +of aggregate types (classes, structs, unions) are passed and values returned via +registers rather than memory. This matters most for external functions linked on +binary level, such as functions called via `invoke_simd`. Arguments and return +values ("ARV") are still passed or returned ("communicated") via a pointer if +their type is either of the following: +- a class or struct with deleted copy constructor +- an empty class or struct +- a class or struct ending with a flexible array member. For example: +`class A { int x[]; }` + +ARVs of all other aggregate types are communicated by value or "per-field". Some +fields can be replaced with 1 or 2 integer elements with total size being equal +or exceeding the total size of fields. The rules for communicating ARVs of these +types are part of the SPIR-V level function call ABI, and are described below. +This part of the ABI is defined in terms of LLVM IR types - it basically +tells how a specific source aggregate type is represented in resulting LLVM IR +when it (the type) is part of a signature of a function with linkage defined. + +Compiler uses aggregate type "unwrapping process" for communicating ARVs. +Unwarapping a structure with a single field results in the unwrapped type of +that field, so unwrapping is a recursive process. Unwrapped primitive type is +the primitive type itself. Structures with pointer fields are not unwrapped. +For example, unwrapping `Y` defined as +```cpp +struct X { int x; }; +struct Y { X x; }; +``` +results in `i32`. Unwrapping `C4` defind as +```cpp +struct A4 { char x; }; +struct B4 { A4 a; }; +struct C4 { + B4 b; + int *ptr; +}; +``` +results in { `%struct.B4`, `i32 addrspace(4)*` } pair of types. Thus, +unwraping can result in a set of a structure, primitive or pointer types - +the "unwrapped type set". + +- If the unwrapped type set has only primitive types, then compiler will "merge" + the resulting types if their total size is less or equal to 8 bytes. The total + size is calculated as `sizeof()`, and structure field + alignment rules can make it greater then the simple sum of `sizeof` of all + the types resulted from unwrapping. [Total size] to [merged type] + correspondence is as follows: + * 1-2 bytes - short + * 3-4 bytes - int + * 5-8 bytes - array of 2 ints + Floating point types are not merged. Structure field alignment rules can + increase the calculated size compared to simple sum of `sizeof` of all the + types. If the total size exceeds 8, then: + * a source parameter of this type is broken down into multiple parameters + with types resulted from unwrapping + * a source return value of this type keeps it (the type) +- If the unwrapped type set has non-primitive types, then merging does not + happen, in this case unwrapping for the return value does not happen as well. + +More examples of the unwrap/merge process: + +- For `C5` in + ```cpp + struct A5a { char x; char y; }; + struct A5b { char x; char y; }; + struct B5 { A5a a; A5b b; }; + struct C5 { + B5 b1; + B5 b2; + }; + ``` + The result is `[2 x i32]`. It is not `i32` because of padding rules, as + sizeof(C5) is 8 for the SPIRV target. +- For `C6` + ```cpp + struct B6 { int *a; int b; }; + struct C6 { + B6 b; + char x; + char y; + + C6 foo() { return *this; } + }; + ``` + the result depends whether this is a type of an argument or a return value. + * Argument: { `%struct.B6`, `i8`, `i8` } type set + * Return value: `%struct.C6` type. Where the struct LLVM types are defined + as: + ``` + %struct.C6 = type { %struct.B6, i8, i8 } + %struct.B6 = type { i32 addrspace(4)*, i32 } + ``` + +Note that `__regcall` does not guarantee passing through registers in the final +generated code. For example, compiler will use a threshold for argument or +return value size, which is implementation-defined. Values larger than the +threshold will still be passed by pointer (memory). + +Example declaration of a `__regcall` function: +```cpp +simd __regcall SCALE(simd v); +``` +The parameter and the return type in the ABI form will be `<8 x float>`. +
+ ## Examples ### Vector addition (USM) ```cpp From 8e942deb1f84143d14927a72ffbab6130559391f Mon Sep 17 00:00:00 2001 From: Konstantin S Bobrovsky Date: Fri, 11 Mar 2022 21:49:14 -0800 Subject: [PATCH 2/3] Address review comments Signed-off-by: Konstantin S Bobrovsky --- clang/lib/Basic/Targets/SPIR.h | 7 +++---- clang/test/CodeGenSYCL/regcall-cc-test.cpp | 9 ++++----- .../sycl_ext_intel_esimd/sycl_ext_intel_esimd.md | 6 ++---- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index bd483e939485c..70874227cf939 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -130,9 +130,8 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo { CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { return (CC == CC_SpirFunction || CC == CC_OpenCLKernel || // Permit CC_X86RegCall which is used to mark external functions - // with - // explicit simd or structure type arguments to pass them via - // registers. + // with explicit simd or structure type arguments to pass them via + // registers. CC == CC_X86RegCall) ? CCCR_OK : CCCR_Warning; @@ -295,7 +294,7 @@ class LLVM_LIBRARY_VISIBILITY WindowsX86_64_SPIR64TargetInfo if (CC == CC_X86VectorCall || CC == CC_X86RegCall) // Permit CC_X86VectorCall which is used in Microsoft headers // Permit CC_X86RegCall which is used to mark external functions with - // explicit simd or structure type arguments to pass them via registers. + // explicit simd or structure type arguments to pass them via registers. return CCCR_OK; return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK : CCCR_Warning; diff --git a/clang/test/CodeGenSYCL/regcall-cc-test.cpp b/clang/test/CodeGenSYCL/regcall-cc-test.cpp index 371c702f10e76..4d0118fcd8714 100644 --- a/clang/test/CodeGenSYCL/regcall-cc-test.cpp +++ b/clang/test/CodeGenSYCL/regcall-cc-test.cpp @@ -91,7 +91,7 @@ template SYCL_DEVICE C2 __regcall func(C2 x); // === TEST CASE: multi-level nested structs with one primitive type element at // the bottom, and one - at the top. The nested struct at the top is expected to -// get "unwraped" by the compiler evaporating to the single element at the +// get "unwrapped" by the compiler evaporating to the single element at the // bottom. struct A3 { char x; }; @@ -110,10 +110,9 @@ struct C3 { // unwrapped template SYCL_DEVICE C3 __regcall func(C3 x); // CHECK-DAG: define weak_odr x86_regcallcc i16 @_Z16__regcall3__funcI2C3ET_S1_(i16 %{{[0-9a-zA-Z_.]+}}) -// === TEST CASE: multi-level nested structs with one primitive type element at -// the bottom, and one - at the top. The nested struct at the top is expected to -// get "unwraped" by the compiler evaporating to the single element at the -// bottom. +// === TEST CASE: multi-level nested structs with a pointer field at the top +// level. 1 step-deep unwrapping for a function argument type and no unwrapping +// for the return type is expected to happen. struct A4 { char x; }; struct B4 { A4 a; }; diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index c7ed61a718a00..b2fbfafe7a43a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -542,7 +542,7 @@ For example, unwrapping `Y` defined as struct X { int x; }; struct Y { X x; }; ``` -results in `i32`. Unwrapping `C4` defind as +results in `i32`. Unwrapping `C4` defined as ```cpp struct A4 { char x; }; struct B4 { A4 a; }; @@ -564,9 +564,7 @@ the "unwrapped type set". * 1-2 bytes - short * 3-4 bytes - int * 5-8 bytes - array of 2 ints - Floating point types are not merged. Structure field alignment rules can - increase the calculated size compared to simple sum of `sizeof` of all the - types. If the total size exceeds 8, then: + If the total size exceeds 8, then: * a source parameter of this type is broken down into multiple parameters with types resulted from unwrapping * a source return value of this type keeps it (the type) From 25682398c7536977f1e1ee7eae4b5a61313b8ddb Mon Sep 17 00:00:00 2001 From: kbobrovs Date: Mon, 14 Mar 2022 15:45:51 -0700 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Aaron Ballman Co-authored-by: premanandrao --- clang/lib/CodeGen/TargetInfo.cpp | 2 +- .../sycl_ext_intel_esimd/sycl_ext_intel_esimd.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 3fc405e316b0e..d5a5d843384ee 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10286,7 +10286,7 @@ class CommonSPIRABIInfo : public DefaultABIInfo { ABIArgInfo classifyKernelArgumentType(QualType Ty) const; - // Add new functions rather then overload existing so that these public APIs + // Add new functions rather than overload existing so that these public APIs // can't be blindly misused with wrong calling convention. ABIArgInfo classifyRegcallReturnType(QualType RetTy) const; ABIArgInfo classifyRegcallArgumentType(QualType RetTy) const; diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index b2fbfafe7a43a..e599f6138f205 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -534,7 +534,7 @@ tells how a specific source aggregate type is represented in resulting LLVM IR when it (the type) is part of a signature of a function with linkage defined. Compiler uses aggregate type "unwrapping process" for communicating ARVs. -Unwarapping a structure with a single field results in the unwrapped type of +Unwrapping a structure with a single field results in the unwrapped type of that field, so unwrapping is a recursive process. Unwrapped primitive type is the primitive type itself. Structures with pointer fields are not unwrapped. For example, unwrapping `Y` defined as @@ -552,13 +552,13 @@ struct C4 { }; ``` results in { `%struct.B4`, `i32 addrspace(4)*` } pair of types. Thus, -unwraping can result in a set of a structure, primitive or pointer types - +unwrapping can result in a set of a structure, primitive or pointer types - the "unwrapped type set". - If the unwrapped type set has only primitive types, then compiler will "merge" - the resulting types if their total size is less or equal to 8 bytes. The total + the resulting types if their total size is less than or equal to 8 bytes. The total size is calculated as `sizeof()`, and structure field - alignment rules can make it greater then the simple sum of `sizeof` of all + alignment rules can make it greater than the simple sum of `sizeof` of all the types resulted from unwrapping. [Total size] to [merged type] correspondence is as follows: * 1-2 bytes - short