From 26e493f17b884533b0e7e7cdfc39eccfd0e4241a Mon Sep 17 00:00:00 2001
From: Konstantin S Bobrovsky
Date: Tue, 30 Jun 2020 22:53:46 -0700
Subject: [PATCH 1/7] Add workaround for MSVC++ 2017 bug to ItaniumDemangle.h.

The workaround avoids a compilation error when building on Windows with
MSVC++ 2017.

Signed-off-by: Konstantin S Bobrovsky
---
 llvm/include/llvm/Demangle/ItaniumDemangle.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index a5847e673ea11..0e4af0ad8a6a4 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -445,6 +445,14 @@ class EnableIfAttr : public Node {
   }
 };
 
+#ifdef _MSC_VER
+// Workaround for MSVC++ bug (Version 2017, 15.8.9) - w/o this forward
+// declaration, the friend declaration in ObjCProtoName below has no effect
+// and leads to a compilation error when the ObjCProtoName::Protocol private
+// field is accessed in PointerType::printLeft.
+class PointerType;
+#endif // _MSC_VER
+
 class ObjCProtoName : public Node {
   const Node *Ty;
   StringView Protocol;

From c59727636782a30a277b06d18b81d899daab9329 Mon Sep 17 00:00:00 2001
From: Konstantin S Bobrovsky
Date: Fri, 12 Jun 2020 11:37:53 -0700
Subject: [PATCH 2/7] [SYCL][ESIMD] Implement IR pass to lower C++ ESIMD
 intrinsics.

The pass transforms Itanium-mangled *__esimd_* C++ intrinsics to the
genx.* style parseable by the ESIMD-capable SPIRV translator.

Authors:
Konstantin S Bobrovsky
Gang Chen
Wei Pan
Denis Bakhvalov
Anton Sidorenko
Kaiyu Chen
Pratik Ashar

Signed-off-by: Konstantin S Bobrovsky
---
 llvm/include/llvm/InitializePasses.h         |    1 +
 llvm/include/llvm/LinkAllPasses.h            |    2 +
 llvm/include/llvm/SYCLLowerIR/LowerESIMD.h   |   39 +
 llvm/lib/SYCLLowerIR/CMakeLists.txt          |   15 +
 llvm/lib/SYCLLowerIR/LowerESIMD.cpp          | 1316 ++++++++++++++++++
 llvm/test/SYCLLowerIR/esimd_lower_intrins.ll |  198 +++
 llvm/tools/opt/opt.cpp                       |    1 +
 7 files changed, 1572 insertions(+)
 create mode 100644 llvm/include/llvm/SYCLLowerIR/LowerESIMD.h
 create mode 100644 llvm/lib/SYCLLowerIR/LowerESIMD.cpp
 create mode 100644 llvm/test/SYCLLowerIR/esimd_lower_intrins.ll

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index b35bd9be861c6..cc8aff352a9e4 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -414,6 +414,7 @@ void initializeStripNonLineTableDebugInfoPass(PassRegistry&);
 void initializeStripSymbolsPass(PassRegistry&);
 void initializeStructurizeCFGPass(PassRegistry&);
 void initializeSYCLLowerWGScopeLegacyPassPass(PassRegistry &);
+void initializeSYCLLowerESIMDLegacyPassPass(PassRegistry &);
 void initializeTailCallElimPass(PassRegistry&);
 void initializeTailDuplicatePass(PassRegistry&);
 void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 09c930fb174f6..3f7d22977d7c7 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/SYCLLowerIR/LowerESIMD.h"
 #include "llvm/SYCLLowerIR/LowerWGScope.h"
 #include "llvm/Support/Valgrind.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
@@ -201,6 +202,7 @@ namespace {
       (void) llvm::createMergeICmpsLegacyPass();
       (void) llvm::createExpandMemCmpPass();
       (void)llvm::createSYCLLowerWGScopePass();
+      (void)llvm::createSYCLLowerESIMDPass();
       std::string buf;
       llvm::raw_string_ostream os(buf);
       (void) llvm::createPrintModulePass(os);
diff --git a/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h b/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h
new file mode 100644
index 0000000000000..2141b65c5895e
--- /dev/null
+++ b/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h
@@ -0,0 +1,39 @@
+//===-- LowerESIMD.h - lower Explicit SIMD (ESIMD) constructs -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lowers CM-specific LLVM IR constructs coming out of the front-end. These are:
+// - ESIMD intrinsics, e.g.:
+//   template <typename T, int N, int M, int VStride, int Width, int Stride,
+//             int ParentWidth = 0>
+//   sycl::intel::gpu::vector_type_t<T, M>
+//   __esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input,
+//                    uint16_t Offset);
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYCLLOWERIR_LOWERESIMD_H
+#define LLVM_SYCLLOWERIR_LOWERESIMD_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// SPIRV (ESIMD) target specific pass to transform ESIMD specific constructs
+/// like intrinsics to a form parsable by the ESIMD-aware SPIRV translator.
+class SYCLLowerESIMDPass : public PassInfoMixin<SYCLLowerESIMDPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &,
+                        SmallPtrSet<Type *, 4> &GVTS);
+};
+
+FunctionPass *createSYCLLowerESIMDPass();
+void initializeSYCLLowerESIMDLegacyPassPass(PassRegistry &);
+
+} // namespace llvm
+
+#endif // LLVM_SYCLLOWERIR_LOWERESIMD_H
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index 7a327d7657b69..03b507bafc9ed 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -1,9 +1,24 @@
 add_llvm_component_library(LLVMSYCLLowerIR
   LowerWGScope.cpp
+  LowerESIMD.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR
+  ${LLVM_MAIN_SRC_DIR}/projects/vc-intrinsics/GenXIntrinsics/include
+  ${LLVM_BINARY_DIR}/projects/vc-intrinsics/GenXIntrinsics/include
 
   DEPENDS
   intrinsics_gen
+  LLVMGenXIntrinsics
+  LLVMDemangle
+  LLVMTransformUtils
+
+  LINK_LIBS
+  LLVMGenXIntrinsics
+  LLVMDemangle
+  LLVMTransformUtils
 )
+
+target_include_directories(LLVMSYCLLowerIR
+  PRIVATE ${LLVM_MAIN_SRC_DIR}/projects/vc-intrinsics/GenXIntrinsics/include
+  PRIVATE ${LLVM_BINARY_DIR}/projects/vc-intrinsics/GenXIntrinsics/include)
diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
new file mode 100644
index 0000000000000..c9e760359a83f
--- /dev/null
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -0,0 +1,1316 @@
+//===-- LowerESIMD.cpp - lower Explicit SIMD (ESIMD) constructs -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// See intro comments in the header.
+//
+// Since the spir* targets use Itanium mangling for C/C++ symbols, the
+// implementation uses the Itanium demangler to demangle device code's
+// C++ intrinsics and access various information, such as their C++ names and
+// the values of the integer template parameters they were instantiated with.
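+//
+// As an illustration (based on the test added by this patch), the mangled name
+//   _Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EE...
+// demangles to __esimd_rdregion<float, 16, 8, 0, 8, 1, 0>(...), from which
+// the pass reads the base name "rdregion" and the integer template arguments
+// used by the lowering rules below to build the llvm.genx.rdregionf call
+// arguments.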
+//===----------------------------------------------------------------------===//
+
+#include "llvm/SYCLLowerIR/LowerESIMD.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Demangle/Demangle.h"
+#include "llvm/Demangle/ItaniumDemangle.h"
+#include "llvm/GenXIntrinsics/GenXIntrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#include <cctype>
+#include <cstring>
+#include <unordered_map>
+
+using namespace llvm;
+namespace id = itanium_demangle;
+
+#define DEBUG_TYPE "lower-esimd"
+
+#define SLM_BTI 254
+
+namespace {
+class SYCLLowerESIMDLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  SYCLLowerESIMDLegacyPass() : FunctionPass(ID) {
+    initializeSYCLLowerESIMDLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  // Run the LowerESIMD pass on the specified function.
+  bool runOnFunction(Function &F) override {
+    FunctionAnalysisManager FAM;
+    auto PA = Impl.run(F, FAM, GenXVolatileTypeSet);
+    return !PA.areAllPreserved();
+  }
+
+  bool doInitialization(Module &M) override {
+    // Emit ESIMD backend compatible metadata.
+    generateKernelMetadata(M);
+    collectGenXVolatileType(M);
+    return false;
+  }
+
+private:
+  SYCLLowerESIMDPass Impl;
+  SmallPtrSet<Type *, 4> GenXVolatileTypeSet;
+  void generateKernelMetadata(Module &M);
+  void collectGenXVolatileType(Module &M);
+};
+} // namespace
+
+char SYCLLowerESIMDLegacyPass::ID = 0;
+INITIALIZE_PASS(SYCLLowerESIMDLegacyPass, "LowerESIMD",
+                "Lower constructs specific to Close To Metal", false, false)
+
+// Public interface to the SYCLLowerESIMDPass.
+FunctionPass *llvm::createSYCLLowerESIMDPass() {
+  return new SYCLLowerESIMDLegacyPass();
+}
+
+namespace {
+// The regexp for ESIMD intrinsics:
+// /^_Z(\d+)__esimd_\w+/
+static constexpr char ESIMD_INTRIN_PREF0[] = "_Z";
+static constexpr char ESIMD_INTRIN_PREF1[] = "__esimd_";
+static constexpr char SPIRV_INTRIN_PREF[] = "__spirv_";
+
+static constexpr char GENX_KERNEL_METADATA[] = "genx.kernels";
+
+struct ESIMDIntrinDesc {
+  // Denotes argument translation rule kind.
+  enum GenXArgRuleKind {
+    SRC_CALL_ARG, // is a call argument
+    SRC_CALL_ALL, // this and subsequent args are just copied from the src call
+    SRC_TMPL_ARG, // is an integer template argument
+    NUM_BYTES,    // is a number of bytes (gather.scaled and scatter.scaled)
+    UNDEF,        // is an undef value
+    CONST_INT16,  // is an i16 constant
+    CONST_INT32,  // is an i32 constant
+    CONST_INT64,  // is an i64 constant
+  };
+
+  enum GenXArgConversion {
+    NONE,  // no conversion
+    TO_I1, // convert vector of N-bit integer to 1-bit
+    TO_SI  // convert to 32-bit integer surface index
+  };
+
+  // Denotes GenX intrinsic name suffix creation rule kind.
+  enum GenXSuffixRuleKind {
+    NO_RULE,
+    BIN_OP,  // ".<binary operation>" - e.g. "*.add"
+    NUM_KIND // "<numeric kind>" - e.g. "*i" for integer, "*f" for float
+  };
+
+  // Represents a rule how a GenX intrinsic argument is created from the
+  // source call instruction.
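+  //
+  // A worked reading of one table entry below (illustration only): in
+  // {"vstore", {"vstore", {a(1), a(0)}}}, rule a(1) takes source call
+  // argument 1 (the value) as the 0th genx.vstore argument, and a(0) takes
+  // source argument 0 (the pointer) as the 1st, i.e. the source argument
+  // order is swapped.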
+  struct ArgRule {
+    GenXArgRuleKind Kind;
+    union Info {
+      struct {
+        int16_t CallArgNo; // SRC_CALL_ARG: source call arg num
+                           // UNDEF: source call arg num to get type from
+                           // -1 denotes return value
+        int16_t Conv;      // GenXArgConversion
+      } Arg;
+      int NRemArgs;           // SRC_CALL_ALL: number of remaining args
+      unsigned int TmplArgNo; // SRC_TMPL_ARG: source template arg num
+      unsigned int ArgConst;  // CONST_INT16/32/64: constant value
+    } I;
+  };
+
+  // Represents a rule how a GenX intrinsic name suffix is created from the
+  // source call instruction.
+  struct NameRule {
+    GenXSuffixRuleKind Kind;
+    union Info {
+      int CallArgNo; // NUM_KIND: source call arg num to get type from
+      int TmplArgNo; // BIN_OP: source template arg num denoting the binary op
+    } I;
+  };
+
+  std::string GenXSpelling;
+  SmallVector<ArgRule, 16> ArgRules;
+  NameRule SuffixRule = {NO_RULE, 0};
+
+  int getNumGenXArgs() const {
+    auto NRules = ArgRules.size();
+
+    if (NRules == 0)
+      return 0;
+
+    // SRC_CALL_ALL is a "shortcut" to save typing, must be the last rule
+    if (ArgRules[NRules - 1].Kind == GenXArgRuleKind::SRC_CALL_ALL)
+      return ArgRules[NRules - 1].I.NRemArgs + (NRules - 1);
+    return NRules;
+  }
+
+  bool isValid() const { return !GenXSpelling.empty(); }
+};
+
+using IntrinTable = std::unordered_map<std::string, ESIMDIntrinDesc>;
+
+class ESIMDIntrinDescTable {
+private:
+  IntrinTable Table;
+
+#define DEF_ARG_RULE(Nm, Kind)                                                 \
+  static constexpr ESIMDIntrinDesc::ArgRule Nm(int16_t N) {                    \
+    return ESIMDIntrinDesc::ArgRule{ESIMDIntrinDesc::Kind, N};                 \
+  }
+  DEF_ARG_RULE(l, SRC_CALL_ALL)
+  DEF_ARG_RULE(t, SRC_TMPL_ARG)
+  DEF_ARG_RULE(u, UNDEF)
+  DEF_ARG_RULE(nbs, NUM_BYTES)
+
+  static constexpr ESIMDIntrinDesc::ArgRule a(int16_t N) {
+    return ESIMDIntrinDesc::ArgRule{
+        ESIMDIntrinDesc::SRC_CALL_ARG,
+        {N, ESIMDIntrinDesc::GenXArgConversion::NONE}};
+  }
+
+  static constexpr ESIMDIntrinDesc::ArgRule ai1(int16_t N) {
+    return ESIMDIntrinDesc::ArgRule{
+        ESIMDIntrinDesc::SRC_CALL_ARG,
+        {N, ESIMDIntrinDesc::GenXArgConversion::TO_I1}};
+  }
+
+  static constexpr ESIMDIntrinDesc::ArgRule aSI(int16_t N) {
+    return ESIMDIntrinDesc::ArgRule{
+        ESIMDIntrinDesc::SRC_CALL_ARG,
+        {N, ESIMDIntrinDesc::GenXArgConversion::TO_SI}};
+  }
+
+  static constexpr ESIMDIntrinDesc::ArgRule c16(int16_t N) {
+    return ESIMDIntrinDesc::ArgRule{ESIMDIntrinDesc::CONST_INT16, N};
+  }
+
+  static constexpr ESIMDIntrinDesc::ArgRule c32(int16_t N) {
+    return ESIMDIntrinDesc::ArgRule{ESIMDIntrinDesc::CONST_INT32, N};
+  }
+
+  static constexpr ESIMDIntrinDesc::ArgRule c64(int16_t N) {
+    return ESIMDIntrinDesc::ArgRule{ESIMDIntrinDesc::CONST_INT64, N};
+  }
+
+  static constexpr ESIMDIntrinDesc::NameRule bo(int16_t N) {
+    return ESIMDIntrinDesc::NameRule{ESIMDIntrinDesc::BIN_OP, N};
+  }
+
+  static constexpr ESIMDIntrinDesc::NameRule nk(int16_t N) {
+    return ESIMDIntrinDesc::NameRule{ESIMDIntrinDesc::NUM_KIND, N};
+  }
+
+public:
+  ESIMDIntrinDescTable() {
+    Table = {
+        // An element of the table is a <key, value> pair: the key is the
+        // source spelling of an intrinsic (what follows the "__esimd_"
+        // prefix), and the value is an instance of the ESIMDIntrinDesc class.
+        // Example for the "rdregion" intrinsic encoding:
+        // "rdregion" - the GenX spelling of the intrinsic (the "llvm.genx."
+        //   prefix and type suffixes may be added to get the full GenX name)
+        // {a(0), t(3),...}
+        //   defines a map from the resulting genx.* intrinsic call arguments
+        //   to the source call's template or function call arguments, e.g.
+        //   - 0th genx arg maps to the 0th source call arg
+        //   - 1st genx arg maps to the 3rd template argument of the source
+        //     call
+        // nk(N) or bo(N)
+        //   a rule applied to the base intrinsic name in order to construct
+        //   a full name (the "llvm.genx." prefix is also added); e.g.
+        //   - nk(-1) denotes adding the return type name-based suffix - "i"
+        //     for integer, "f" for floating point
+        {"rdregion",
+         {"rdregion", {a(0), t(3), t(4), t(5), a(1), t(6)}, nk(-1)}},
+        {{"wrregion"},
+         {{"wrregion"},
+          {a(0), a(1), t(3), t(4), t(5), a(2), t(6), ai1(3)},
+          nk(-1)}},
+        {"vload", {"vload", {l(0)}}},
+        {"vstore", {"vstore", {a(1), a(0)}}},
+
+        {"flat_block_read_unaligned", {"svm.block.ld.unaligned", {l(0)}}},
+        {"flat_block_write", {"svm.block.st", {l(1)}}},
+        {"flat_read", {"svm.gather", {ai1(2), a(1), a(0), u(-1)}}},
+        {"flat_read4",
+         {"svm.gather4.scaled", {ai1(1), t(2), c16(0), c64(0), a(0), u(-1)}}},
+        {"flat_write", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}},
+        {"flat_write4",
+         {"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}},
+        // Intrinsics to query thread's coordinates:
+        {"group_id_x", {"group.id.x", {}}},
+        {"group_id_y", {"group.id.y", {}}},
+        {"group_id_z", {"group.id.z", {}}},
+        {"local_id", {"local.id", {}}},
+        {"local_size", {"local.size", {}}},
+        {"flat_atomic0", {"svm.atomic", {ai1(1), a(0), u(-1)}, bo(0)}},
+        {"flat_atomic1", {"svm.atomic", {ai1(2), a(0), a(1), u(-1)}, bo(0)}},
+        {"flat_atomic2",
+         {"svm.atomic", {ai1(3), a(0), a(1), a(2), u(-1)}, bo(0)}},
+        {"reduced_fmax", {"fmax", {a(0), a(1)}}},
+        {"reduced_umax", {"umax", {a(0), a(1)}}},
+        {"reduced_smax", {"smax", {a(0), a(1)}}},
+        {"reduced_fmin", {"fmin", {a(0), a(1)}}},
+        {"reduced_umin", {"umin", {a(0), a(1)}}},
+        {"reduced_smin", {"smin", {a(0), a(1)}}},
+        {"dp4", {"dp4", {a(0), a(1)}}},
+        // The 2nd argument of media.* is a surface index - it is produced by
+        // casting and truncating the OpenCL opaque image pointer source
+        // media_block* intrinsic argument; this is according to the OpenCL
+        // runtime - JIT compiler handshake protocol for OpenCL images.
+ {"media_block_load", + {"media.ld", {a(0), aSI(1), a(2), a(3), a(4), a(5)}}}, + {"media_block_store", + {"media.st", {a(0), aSI(1), a(2), a(3), a(4), a(5), a(6)}}}, + {"slm_fence", {"fence", {a(0)}}}, + {"barrier", {"barrier", {}}}, + {"block_read", {"oword.ld.unaligned", {c32(0), aSI(0), a(1)}}}, + {"block_write", {"oword.st", {aSI(0), a(1), a(2)}}}, + {"slm_block_read", + {"oword.ld.unaligned", {c32(0), c32(SLM_BTI), a(0)}}}, + {"slm_block_write", {"oword.st", {c32(SLM_BTI), a(0), a(1)}}}, + {"slm_read", + {"gather.scaled", + {ai1(1), nbs(-1), c16(0), c32(SLM_BTI), c32(0), a(0), u(-1)}}}, + {"slm_read4", + {"gather4.scaled", + {ai1(1), t(2), c16(0), c32(SLM_BTI), c32(0), a(0), u(-1)}}}, + {"slm_write", + {"scatter.scaled", + {ai1(2), nbs(1), c16(0), c32(SLM_BTI), c32(0), a(0), a(1)}}}, + {"slm_write4", + {"scatter4.scaled", + {ai1(2), t(2), c16(0), c32(SLM_BTI), c32(0), a(0), a(1)}}}, + {"slm_atomic0", + {"dword.atomic", {ai1(1), c32(SLM_BTI), a(0), u(-1)}, bo(0)}}, + {"slm_atomic1", + {"dword.atomic", {ai1(2), c32(SLM_BTI), a(0), a(1), u(-1)}, bo(0)}}, + {"slm_atomic2", + {"dword.atomic", + {ai1(3), c32(SLM_BTI), a(0), a(1), a(2), u(-1)}, + bo(0)}}, + {"raw_sends_load", + {"raw.sends2", + {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7), a(8), a(9), a(10), + a(11)}}}, + {"raw_send_load", + {"raw.send2", + {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7), a(8), a(9)}}}, + {"raw_sends_store", + {"raw.sends2.noresult", + {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7), a(8), a(9)}}}, + {"raw_send_store", + {"raw.send2.noresult", + {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7)}}}, + {"satf", {"sat", {a(0)}}}, + {"fptoui_sat", {"fptoui.sat", {a(0)}}}, + {"fptosi_sat", {"fptosi.sat", {a(0)}}}, + {"uutrunc_sat", {"uutrunc.sat", {a(0)}}}, + {"ustrunc_sat", {"ustrunc.sat", {a(0)}}}, + {"sutrunc_sat", {"sutrunc.sat", {a(0)}}}, + {"sstrunc_sat", {"sstrunc.sat", {a(0)}}}, + {"abs", {"abs", {a(0)}, nk(-1)}}, + {"ssshl", {"ssshl", {a(0), a(1)}}}, + {"sushl", {"sushl", {a(0), a(1)}}}, + {"usshl", {"usshl", {a(0), a(1)}}}, + {"uushl", {"uushl", {a(0), a(1)}}}, + {"ssshl_sat", {"ssshl.sat", {a(0), a(1)}}}, + {"sushl_sat", {"sushl.sat", {a(0), a(1)}}}, + {"usshl_sat", {"usshl.sat", {a(0), a(1)}}}, + {"uushl_sat", {"uushl.sat", {a(0), a(1)}}}, + {"rol", {"rol", {a(0), a(1)}}}, + {"ror", {"ror", {a(0), a(1)}}}, + {"umulh", {"umulh", {a(0), a(1)}}}, + {"smulh", {"smulh", {a(0), a(1)}}}, + {"frc", {"frc", {a(0)}}}, + {"fmax", {"fmax", {a(0), a(1)}}}, + {"umax", {"umax", {a(0), a(1)}}}, + {"smax", {"smax", {a(0), a(1)}}}, + {"lzd", {"lzd", {a(0)}}}, + {"fmin", {"fmin", {a(0), a(1)}}}, + {"umin", {"umin", {a(0), a(1)}}}, + {"smin", {"smin", {a(0), a(1)}}}, + {"bfrev", {"bfrev", {a(0)}}}, + {"cbit", {"cbit", {a(0)}}}, + {"bfins", {"bfi", {a(0), a(1), a(2), a(3)}}}, + {"bfext", {"sbfe", {a(0), a(1), a(2)}}}, + {"fbl", {"fbl", {a(0)}}}, + {"sfbh", {"sfbh", {a(0)}}}, + {"ufbh", {"ufbh", {a(0)}}}, + {"inv", {"inv", {a(0)}}}, + {"log", {"log", {a(0)}}}, + {"exp", {"exp", {a(0)}}}, + {"sqrt", {"sqrt", {a(0)}}}, + {"sqrt_ieee", {"ieee.sqrt", {a(0)}}}, + {"rsqrt", {"rsqrt", {a(0)}}}, + {"sin", {"sin", {a(0)}}}, + {"cos", {"cos", {a(0)}}}, + {"pow", {"pow", {a(0), a(1)}}}, + {"div_ieee", {"ieee.div", {a(0), a(1)}}}, + {"dp4a", {"dp4a", {a(0), a(1), a(2)}}}, + {"any", {"any", {ai1(0)}}}, + {"all", {"all", {ai1(0)}}}, + }; + } + + const IntrinTable &getTable() { return Table; } +}; + +// The C++11 "magic static" idiom to lazily initialize the ESIMD intrinsic table +static const IntrinTable &getIntrinTable() { + 
+  static ESIMDIntrinDescTable TheTable;
+  return TheTable.getTable();
+}
+
+static const ESIMDIntrinDesc &getIntrinDesc(StringRef SrcSpelling) {
+  static ESIMDIntrinDesc InvalidDesc{"", {}, {}};
+  const auto &Table = getIntrinTable();
+  auto It = Table.find(SrcSpelling.str());
+
+  if (It == Table.end()) {
+    Twine Msg("unknown ESIMD intrinsic: " + SrcSpelling);
+
+    llvm::errs() << Msg << "\n";
+    // TODO warning message for now, to enable compiling tests with intrinsics
+    // that are not implemented yet
+    // llvm::report_fatal_error(Msg, false/*no crash diag*/);
+    return InvalidDesc;
+  }
+  return It->second;
+}
+
+// Simplest possible implementation of an allocator for the Itanium demangler
+class SimpleAllocator {
+protected:
+  SmallVector<void *, 128> Ptrs;
+
+public:
+  void reset() {
+    for (void *Ptr : Ptrs) {
+      // Destructors are not called, but that is OK for the
+      // itanium_demangle::Node subclasses
+      std::free(Ptr);
+    }
+    Ptrs.resize(0);
+  }
+
+  template <typename T, typename... Args> T *makeNode(Args &&... args) {
+    void *Ptr = std::calloc(1, sizeof(T));
+    Ptrs.push_back(Ptr);
+    return new (Ptr) T(std::forward<Args>(args)...);
+  }
+
+  void *allocateNodeArray(size_t sz) {
+    void *Ptr = std::calloc(sz, sizeof(id::Node *));
+    Ptrs.push_back(Ptr);
+    return Ptr;
+  }
+
+  ~SimpleAllocator() { reset(); }
+};
+
+Type *parsePrimitiveTypeString(StringRef TyStr, LLVMContext &Ctx) {
+  return llvm::StringSwitch<Type *>(TyStr)
+      .Case("bool", IntegerType::getInt1Ty(Ctx))
+      .Case("char", IntegerType::getInt8Ty(Ctx))
+      .Case("unsigned char", IntegerType::getInt8Ty(Ctx))
+      .Case("short", IntegerType::getInt16Ty(Ctx))
+      .Case("unsigned short", IntegerType::getInt16Ty(Ctx))
+      .Case("int", IntegerType::getInt32Ty(Ctx))
+      .Case("unsigned int", IntegerType::getInt32Ty(Ctx))
+      .Case("unsigned", IntegerType::getInt32Ty(Ctx))
+      .Case("unsigned long long", IntegerType::getInt64Ty(Ctx))
+      .Case("long long", IntegerType::getInt64Ty(Ctx))
+      .Case("float", Type::getFloatTy(Ctx))
+      .Case("double", Type::getDoubleTy(Ctx))
+      .Case("void", Type::getVoidTy(Ctx))
+      .Case("", nullptr)
+      .Default(nullptr);
+}
+
+template <class T>
+static const T *castNodeImpl(const id::Node *N, id::Node::Kind K) {
+  assert(N && N->getKind() == K && "unexpected demangler node kind");
+  return reinterpret_cast<const T *>(N);
+}
+
+#define castNode(NodeObj, NodeKind)                                            \
+  castNodeImpl<id::NodeKind>(NodeObj, id::Node::K##NodeKind)
+
+static APInt parseTemplateArg(id::FunctionEncoding *FE, unsigned int N,
+                              Type *&Ty, LLVMContext &Ctx) {
+  auto *Nm = castNode(FE->getName(), NameWithTemplateArgs);
+  auto *ArgsN = castNode(Nm->TemplateArgs, TemplateArgs);
+  id::NodeArray Args = ArgsN->getParams();
+  assert(N < Args.size() && "too few template arguments");
+  id::StringView Val;
+
+  switch (Args[N]->getKind()) {
+  case id::Node::KIntegerLiteral: {
+    auto *ValL = castNode(Args[N], IntegerLiteral);
+    const id::StringView &TyStr = ValL->getType();
+    Ty = TyStr.size() == 0
+             ? IntegerType::getInt32Ty(Ctx)
+             : parsePrimitiveTypeString(StringRef(TyStr.begin(), TyStr.size()),
+                                        Ctx);
+    Val = ValL->getValue();
+    break;
+  }
+  case id::Node::KEnumLiteral: {
+    auto *CE = castNode(Args[N], EnumLiteral);
+    Ty = IntegerType::getInt32Ty(Ctx);
+    Val = CE->getIntegerValue();
+    break;
+  }
+  default:
+    llvm_unreachable_internal("bad esimd intrinsic template parameter");
+  }
+  return APInt(Ty->getPrimitiveSizeInBits(),
+               StringRef(Val.begin(), Val.size()), 10);
+}
+
+// Constructs a GenX intrinsic name suffix based on the original C++ name
+// (stem) and the types of its parameters (some intrinsic names have
+// additional suffixes depending on the parameter types).
+static std::string getESIMDIntrinSuffix(id::FunctionEncoding *FE,
+                                        FunctionType *FT,
+                                        const ESIMDIntrinDesc::NameRule &Rule) {
+  std::string Suff;
+  switch (Rule.Kind) {
+  case ESIMDIntrinDesc::GenXSuffixRuleKind::BIN_OP: {
+    // e.g. ".add"
+    Type *Ty = nullptr;
+    APInt OpId = parseTemplateArg(FE, Rule.I.TmplArgNo, Ty, FT->getContext());
+
+    switch (OpId.getSExtValue()) {
+    case 0x0:
+      Suff = ".add";
+      break;
+    case 0x1:
+      Suff = ".sub";
+      break;
+    case 0x2:
+      Suff = ".inc";
+      break;
+    case 0x3:
+      Suff = ".dec";
+      break;
+    case 0x4:
+      Suff = ".min";
+      break;
+    case 0x5:
+      Suff = ".max";
+      break;
+    case 0x6:
+      Suff = ".xchg";
+      break;
+    case 0x7:
+      Suff = ".cmpxchg";
+      break;
+    case 0x8:
+      Suff = ".and";
+      break;
+    case 0x9:
+      Suff = ".or";
+      break;
+    case 0xa:
+      Suff = ".xor";
+      break;
+    case 0xb:
+      Suff = ".minsint";
+      break;
+    case 0xc:
+      Suff = ".maxsint";
+      break;
+    case 0x10:
+      Suff = ".fmax";
+      break;
+    case 0x11:
+      Suff = ".fmin";
+      break;
+    case 0x12:
+      Suff = ".fcmpwr";
+      break;
+    case 0xff:
+      Suff = ".predec";
+      break;
+    default:
+      llvm_unreachable("unknown atomic OP");
+    }
+    break;
+  }
+  case ESIMDIntrinDesc::GenXSuffixRuleKind::NUM_KIND: {
+    // e.g. "f"
+    int No = Rule.I.CallArgNo;
+    Type *Ty = No == -1 ? FT->getReturnType() : FT->getParamType(No);
+    if (Ty->isVectorTy())
+      Ty = cast<VectorType>(Ty)->getElementType();
+    assert(Ty->isFloatingPointTy() || Ty->isIntegerTy());
+    Suff = Ty->isFloatingPointTy() ? "f" : "i";
+    break;
+  }
+  default:
+    // It's ok if there is no suffix.
+    break;
+  }
+
+  return Suff;
+}
+
+// Turns the given metadata into an llvm::Value or one of its subclasses.
+// Returns nullptr if the underlying value has a type mismatch.
+template <typename Ty = llvm::Value> Ty *getVal(llvm::Metadata *M) {
+  if (auto VM = dyn_cast<llvm::ValueAsMetadata>(M))
+    if (auto V = dyn_cast<Ty>(VM->getValue()))
+      return V;
+  return nullptr;
+}
+
+/// Returns the MDNode that carries the SLM size attribute.
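+///
+/// For reference, a genx.kernels entry created by generateKernelMetadata
+/// below has the following operand layout (a sketch; operand 3 holds the
+/// SLM size in bytes):
+///   !{void (...)* @kernel, !"kernel_name", !argKinds, i32 SLMSize,
+///     i32 ArgOffsets, !argIOKinds, !argTypeDescs}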
+static llvm::MDNode *getSLMSizeMDNode(llvm::Function *F) {
+  llvm::NamedMDNode *Nodes =
+      F->getParent()->getNamedMetadata(GENX_KERNEL_METADATA);
+  for (auto Node : Nodes->operands()) {
+    if (Node->getNumOperands() >= 4 && getVal(Node->getOperand(0)) == F)
+      return Node;
+  }
+  // If F is not a kernel, keep looking into its callers.
+  while (!F->use_empty()) {
+    auto CI = cast<CallInst>(F->use_begin()->getUser());
+    auto UF = CI->getParent()->getParent();
+    if (auto Node = getSLMSizeMDNode(UF))
+      return Node;
+  }
+  return nullptr;
+}
+
+static inline llvm::Metadata *getMD(llvm::Value *V) {
+  return llvm::ValueAsMetadata::get(V);
+}
+
+static void translateSLMInit(CallInst &CI) {
+  auto F = CI.getParent()->getParent();
+
+  auto *ArgV = CI.getArgOperand(0);
+  if (!isa<ConstantInt>(ArgV)) {
+    assert(false && "integral constant expected for slm size");
+    return;
+  }
+  auto NewVal = cast<ConstantInt>(ArgV)->getZExtValue();
+  assert(NewVal != 0 && "zero slm bytes being requested");
+
+  // Find the corresponding kernel metadata and set the SLM size.
+  if (llvm::MDNode *Node = getSLMSizeMDNode(F)) {
+    if (llvm::Value *OldSz = getVal(Node->getOperand(3))) {
+      assert(isa<llvm::ConstantInt>(OldSz) && "integer constant expected");
+      llvm::Value *NewSz = llvm::ConstantInt::get(OldSz->getType(), NewVal);
+      uint64_t OldVal = cast<llvm::ConstantInt>(OldSz)->getZExtValue();
+      if (OldVal < NewVal)
+        Node->replaceOperandWith(3, getMD(NewSz));
+    }
+  } else {
+    // We check whether this call is inside a kernel function.
+    assert(false && "slm_init shall be called by a kernel");
+  }
+}
+
+static void translatePackMask(CallInst &CI) {
+  using Demangler = id::ManglingParser<SimpleAllocator>;
+  Function *F = CI.getCalledFunction();
+  StringRef MnglName = F->getName();
+  Demangler Parser(MnglName.begin(), MnglName.end());
+  id::Node *AST = Parser.parse();
+
+  if (!AST || !Parser.ForwardTemplateRefs.empty()) {
+    Twine Msg("failed to demangle ESIMD intrinsic: " + MnglName);
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
+  }
+  if (AST->getKind() != id::Node::KFunctionEncoding) {
+    Twine Msg("bad ESIMD intrinsic: " + MnglName);
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
+  }
+  auto *FE = static_cast<id::FunctionEncoding *>(AST);
+  llvm::LLVMContext &Context = CI.getContext();
+  Type *TTy = nullptr;
+  APInt Val = parseTemplateArg(FE, 0, TTy, Context);
+  unsigned N = Val.getZExtValue();
+
+  IRBuilder<> Builder(&CI);
+  llvm::Value *Trunc = Builder.CreateTrunc(
+      CI.getArgOperand(0),
+      llvm::VectorType::get(llvm::Type::getInt1Ty(Context), N));
+  llvm::Type *Ty = llvm::Type::getIntNTy(Context, N);
+
+  llvm::Value *BitCast = Builder.CreateBitCast(Trunc, Ty);
+  llvm::Value *Result = BitCast;
+  if (N != 32) {
+    Result = Builder.CreateCast(llvm::Instruction::ZExt, BitCast,
+                                llvm::Type::getInt32Ty(Context));
+  }
+
+  Result->setName(CI.getName());
+  cast<llvm::Instruction>(Result)->setDebugLoc(CI.getDebugLoc());
+  CI.replaceAllUsesWith(Result);
+}
+
+static void translateUnPackMask(CallInst &CI) {
+  using Demangler = id::ManglingParser<SimpleAllocator>;
+  Function *F = CI.getCalledFunction();
+  StringRef MnglName = F->getName();
+  Demangler Parser(MnglName.begin(), MnglName.end());
+  id::Node *AST = Parser.parse();
+
+  if (!AST || !Parser.ForwardTemplateRefs.empty()) {
+    Twine Msg("failed to demangle ESIMD intrinsic: " + MnglName);
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
+  }
+  if (AST->getKind() != id::Node::KFunctionEncoding) {
+    Twine Msg("bad ESIMD intrinsic: " + MnglName);
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
+  }
+  auto *FE = static_cast<id::FunctionEncoding *>(AST);
+  llvm::LLVMContext &Context = CI.getContext();
+  Type *TTy = nullptr;
+  APInt Val = parseTemplateArg(FE, 0, TTy, Context);
+  unsigned N = Val.getZExtValue();
+  // Get N x i1.
+  assert(CI.getNumArgOperands() == 1);
+  llvm::Value *Arg0 = CI.getArgOperand(0);
+  unsigned Width = Arg0->getType()->getPrimitiveSizeInBits();
+  IRBuilder<> Builder(&CI);
+  if (Width > N) {
+    llvm::Type *Ty = llvm::IntegerType::get(Context, N);
+    Arg0 = Builder.CreateTrunc(Arg0, Ty);
+    cast<llvm::Instruction>(Arg0)->setDebugLoc(CI.getDebugLoc());
+  }
+  assert(Arg0->getType()->getPrimitiveSizeInBits() == N);
+  Arg0 = Builder.CreateBitCast(
+      Arg0, llvm::VectorType::get(llvm::Type::getInt1Ty(Context), N));
+
+  // Get N x i16.
+  llvm::Value *TransCI = Builder.CreateZExt(
+      Arg0, llvm::VectorType::get(llvm::Type::getInt16Ty(Context), N));
+  TransCI->takeName(&CI);
+  cast<llvm::Instruction>(TransCI)->setDebugLoc(CI.getDebugLoc());
+  CI.replaceAllUsesWith(TransCI);
+}
+
+static bool translateVLoad(CallInst &CI, SmallPtrSet<Type *, 4> &GVTS) {
+  if (GVTS.find(CI.getType()) != GVTS.end())
+    return false;
+  IRBuilder<> Builder(&CI);
+  auto LI = Builder.CreateLoad(CI.getArgOperand(0), CI.getName());
+  LI->setDebugLoc(CI.getDebugLoc());
+  CI.replaceAllUsesWith(LI);
+  return true;
+}
+
+static bool translateVStore(CallInst &CI, SmallPtrSet<Type *, 4> &GVTS) {
+  if (GVTS.find(CI.getOperand(1)->getType()) != GVTS.end())
+    return false;
+  IRBuilder<> Builder(&CI);
+  auto SI = Builder.CreateStore(CI.getArgOperand(1), CI.getArgOperand(0));
+  SI->setDebugLoc(CI.getDebugLoc());
+  return true;
+}
+
+static void translateGetValue(CallInst &CI) {
+  auto opnd = CI.getArgOperand(0);
+  assert(opnd->getType()->isPointerTy());
+  IRBuilder<> Builder(&CI);
+  auto SV =
+      Builder.CreatePtrToInt(opnd, IntegerType::getInt32Ty(CI.getContext()));
+  auto *SI = dyn_cast<Instruction>(SV);
+  SI->setDebugLoc(CI.getDebugLoc());
+  CI.replaceAllUsesWith(SI);
+}
+
+// A newly created GenX intrinsic might have a different return type than
+// expected. This helper creates a cast operation from the GenX intrinsic
+// return type to the currently expected one. Returns a pointer to the
+// created cast instruction if one was created, otherwise returns NewI.
+static Instruction *addCastInstIfNeeded(Instruction *OldI, Instruction *NewI) {
+  Type *NITy = NewI->getType();
+  Type *OITy = OldI->getType();
+  if (OITy != NITy) {
+    assert(
+        CastInst::isCastable(OITy, NITy) &&
+        "Cannot add cast instruction while translating ESIMD intrinsic call");
+    auto CastOpcode = CastInst::getCastOpcode(NewI, false, OITy, false);
+    NewI = CastInst::Create(CastOpcode, NewI, OITy,
+                            NewI->getName() + ".cast.ty", OldI);
+  }
+  return NewI;
+}
+
+static int getIndexForSuffix(StringRef Suff) {
+  return llvm::StringSwitch<int>(Suff)
+      .Case("x", 0)
+      .Case("y", 1)
+      .Case("z", 2)
+      .Default(-1);
+}
+
+// Helper function to convert a SPIRV intrinsic into a GenX intrinsic that
+// returns a vector of coordinates.
+// Example:
+//   %call = call spir_func i64 @_Z23__spirv_WorkgroupSize_xv()
+// =>
+//   %call.esimd = tail call <3 x i32> @llvm.genx.local.size.v3i32()
+//   %wgsize.x = extractelement <3 x i32> %call.esimd, i32 0
+//   %wgsize.x.cast.ty = zext i32 %wgsize.x to i64
+static Instruction *generateVectorGenXForSpirv(CallInst &CI, StringRef Suff,
+                                               const std::string &IntrinName,
+                                               StringRef ValueName) {
+  std::string IntrName =
+      std::string(GenXIntrinsic::getGenXIntrinsicPrefix()) + IntrinName;
+  auto ID = GenXIntrinsic::lookupGenXIntrinsicID(IntrName);
+  LLVMContext &Ctx = CI.getModule()->getContext();
+  Type *I32Ty = Type::getInt32Ty(Ctx);
+  Function *NewFDecl = GenXIntrinsic::getGenXDeclaration(
+      CI.getModule(), ID, {VectorType::get(I32Ty, 3)});
+  Instruction *IntrI =
+      IntrinsicInst::Create(NewFDecl, {}, CI.getName() + ".esimd", &CI);
+  int ExtractIndex = getIndexForSuffix(Suff);
+  assert(ExtractIndex != -1 && "Extract index is invalid.");
+  Twine ExtractName = ValueName + Suff;
+  Instruction *ExtrI = ExtractElementInst::Create(
+      IntrI, ConstantInt::get(I32Ty, ExtractIndex), ExtractName, &CI);
+  Instruction *CastI = addCastInstIfNeeded(&CI, ExtrI);
+  return CastI;
+}
+
+// Helper function to convert a SPIRV intrinsic into a GenX intrinsic that
+// has an exact mapping.
+// Example:
+//   %call = call spir_func i64 @_Z21__spirv_WorkgroupId_xv()
+// =>
+//   %group.id.x = tail call i32 @llvm.genx.group.id.x()
+//   %group.id.x.cast.ty = zext i32 %group.id.x to i64
+static Instruction *generateGenXForSpirv(CallInst &CI, StringRef Suff,
+                                         const std::string &IntrinName) {
+  std::string IntrName = std::string(GenXIntrinsic::getGenXIntrinsicPrefix()) +
+                         IntrinName + Suff.str();
+  auto ID = GenXIntrinsic::lookupGenXIntrinsicID(IntrName);
+  Function *NewFDecl =
+      GenXIntrinsic::getGenXDeclaration(CI.getModule(), ID, {});
+  Instruction *IntrI =
+      IntrinsicInst::Create(NewFDecl, {}, IntrinName + Suff.str(), &CI);
+  Instruction *CastI = addCastInstIfNeeded(&CI, IntrI);
+  return CastI;
+}
+
+// This function translates a SPIRV intrinsic into a GenX intrinsic.
+// TODO: Currently, we do not support mixing SYCL and ESIMD kernels.
+// Later, for ESIMD and SYCL kernels to coexist, we will likely need to clone
+// the call graph that leads from an ESIMD kernel to a SPIRV intrinsic, and
+// translate SPIRV intrinsics to GenX intrinsics only in the cloned subgraph.
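+//
+// As a sketch of what the GlobalInvocationId case below produces for the
+// "x" suffix (value names are illustrative):
+//   %local_id.x = ...   ; extracted from llvm.genx.local.id.v3i32
+//   %wgsize.x   = ...   ; extracted from llvm.genx.local.size.v3i32
+//   %group.id.x = call i32 @llvm.genx.group.id.x()
+//   %mul = mul i32 %wgsize.x, %group.id.x
+//   %add = add i32 %local_id.x, %mul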
+static void
+translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
+                        SmallVector<Instruction *, 8> &ESIMDToErases) {
+  auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
+                             CI](StringRef SpvIName, auto TranslateFunc) {
+    if (SpirvIntrName.startswith(SpvIName)) {
+      Value *TranslatedV =
+          TranslateFunc(*CI, SpirvIntrName.substr(SpvIName.size() + 1, 1));
+      CI->replaceAllUsesWith(TranslatedV);
+      ESIMDToErases.push_back(CI);
+    }
+  };
+
+  translateSpirvIntr("WorkgroupSize", [](CallInst &CI, StringRef Suff) {
+    return generateVectorGenXForSpirv(CI, Suff, "local.size.v3i32", "wgsize.");
+  });
+  translateSpirvIntr("LocalInvocationId", [](CallInst &CI, StringRef Suff) {
+    return generateVectorGenXForSpirv(CI, Suff, "local.id.v3i32", "local_id.");
+  });
+  translateSpirvIntr("WorkgroupId", [](CallInst &CI, StringRef Suff) {
+    return generateGenXForSpirv(CI, Suff, "group.id.");
+  });
+  translateSpirvIntr("GlobalInvocationId", [](CallInst &CI, StringRef Suff) {
+    // GlobalId = LocalId + WorkGroupSize * GroupId
+    Instruction *LocalIdI =
+        generateVectorGenXForSpirv(CI, Suff, "local.id.v3i32", "local_id.");
+    Instruction *WGSizeI =
+        generateVectorGenXForSpirv(CI, Suff, "local.size.v3i32", "wgsize.");
+    Instruction *GroupIdI = generateGenXForSpirv(CI, Suff, "group.id.");
+    Instruction *MulI =
+        BinaryOperator::CreateMul(WGSizeI, GroupIdI, "mul", &CI);
+    return BinaryOperator::CreateAdd(LocalIdI, MulI, "add", &CI);
+  });
+  translateSpirvIntr("GlobalSize", [](CallInst &CI, StringRef Suff) {
+    // GlobalSize = WorkGroupSize * NumWorkGroups
+    Instruction *WGSizeI =
+        generateVectorGenXForSpirv(CI, Suff, "local.size.v3i32", "wgsize.");
+    Instruction *NumWGI = generateVectorGenXForSpirv(
+        CI, Suff, "group.count.v3i32", "group_count.");
+    return BinaryOperator::CreateMul(WGSizeI, NumWGI, "mul", &CI);
+  });
+  // TODO: Support GlobalOffset SPIRV intrinsics
+  translateSpirvIntr("GlobalOffset", [](CallInst &CI, StringRef Suff) {
+    return llvm::Constant::getNullValue(CI.getType());
+  });
+  translateSpirvIntr("NumWorkgroups", [](CallInst &CI, StringRef Suff) {
+    return generateVectorGenXForSpirv(CI, Suff, "group.count.v3i32",
+                                      "group_count.");
+  });
+}
+
+static void createESIMDIntrinsicArgs(const ESIMDIntrinDesc &Desc,
+                                     SmallVector<Value *, 16> &GenXArgs,
+                                     CallInst &CI, id::FunctionEncoding *FE) {
+  uint32_t LastCppArgNo = 0; // to implement SRC_CALL_ALL
+
+  for (unsigned int I = 0; I < Desc.ArgRules.size(); ++I) {
+    const ESIMDIntrinDesc::ArgRule &Rule = Desc.ArgRules[I];
+
+    switch (Rule.Kind) {
+    case ESIMDIntrinDesc::GenXArgRuleKind::SRC_CALL_ARG: {
+      Value *Arg = CI.getArgOperand(Rule.I.Arg.CallArgNo);
+
+      switch (Rule.I.Arg.Conv) {
+      case ESIMDIntrinDesc::GenXArgConversion::NONE:
+        GenXArgs.push_back(Arg);
+        break;
+      case ESIMDIntrinDesc::GenXArgConversion::TO_I1: {
+        // Convert an N-bit integer to a 1-bit integer.
+        Type *NTy = Arg->getType();
+        assert(NTy->isIntOrIntVectorTy());
+        Value *Zero = ConstantInt::get(NTy, 0);
+        IRBuilder<> Bld(&CI);
+        auto *Cmp = Bld.CreateICmp(ICmpInst::ICMP_NE, Arg, Zero);
+        GenXArgs.push_back(Cmp);
+        break;
+      }
+      case ESIMDIntrinDesc::GenXArgConversion::TO_SI: {
+        // Convert a pointer to a 32-bit integer surface index.
+        assert(Arg->getType()->isPointerTy());
+        IRBuilder<> Bld(&CI);
+        Value *Res =
+            Bld.CreatePtrToInt(Arg, IntegerType::getInt32Ty(CI.getContext()));
+        GenXArgs.push_back(Res);
+        break;
+      }
+      default:
+        llvm_unreachable("Unknown ESIMD arg conversion");
+      }
+      LastCppArgNo = Rule.I.Arg.CallArgNo;
+      break;
+    }
+    case ESIMDIntrinDesc::GenXArgRuleKind::SRC_CALL_ALL:
+      assert(LastCppArgNo < CI.getNumArgOperands());
+      for (uint32_t N = LastCppArgNo; N < CI.getNumArgOperands(); ++N)
+        GenXArgs.push_back(CI.getArgOperand(N));
+      break;
+    case ESIMDIntrinDesc::GenXArgRuleKind::SRC_TMPL_ARG: {
+      Type *Ty = nullptr;
+      APInt Val = parseTemplateArg(FE, Rule.I.TmplArgNo, Ty, CI.getContext());
+      Value *ArgVal = ConstantInt::get(
+          Ty, static_cast<uint64_t>(Val.getSExtValue()), true /*signed*/);
+      GenXArgs.push_back(ArgVal);
+      break;
+    }
+    case ESIMDIntrinDesc::GenXArgRuleKind::NUM_BYTES: {
+      Type *Ty = Rule.I.Arg.CallArgNo == -1
+                     ? CI.getType()
+                     : CI.getArgOperand(Rule.I.Arg.CallArgNo)->getType();
+      assert(Ty->isVectorTy());
+      int NBits =
+          cast<VectorType>(Ty)->getElementType()->getPrimitiveSizeInBits();
+      assert(NBits == 8 || NBits == 16 || NBits == 32);
+      int NWords = NBits / 16;
+      GenXArgs.push_back(
+          ConstantInt::get(IntegerType::getInt32Ty(CI.getContext()), NWords));
+      break;
+    }
+    case ESIMDIntrinDesc::GenXArgRuleKind::UNDEF: {
+      Type *Ty = Rule.I.Arg.CallArgNo == -1
+                     ? CI.getType()
+                     : CI.getArgOperand(Rule.I.Arg.CallArgNo)->getType();
+      GenXArgs.push_back(UndefValue::get(Ty));
+      break;
+    }
+    case ESIMDIntrinDesc::GenXArgRuleKind::CONST_INT16: {
+      auto Ty = IntegerType::getInt16Ty(CI.getContext());
+      GenXArgs.push_back(llvm::ConstantInt::get(Ty, Rule.I.ArgConst));
+      break;
+    }
+    case ESIMDIntrinDesc::GenXArgRuleKind::CONST_INT32: {
+      auto Ty = IntegerType::getInt32Ty(CI.getContext());
+      GenXArgs.push_back(llvm::ConstantInt::get(Ty, Rule.I.ArgConst));
+      break;
+    }
+    case ESIMDIntrinDesc::GenXArgRuleKind::CONST_INT64: {
+      auto Ty = IntegerType::getInt64Ty(CI.getContext());
+      GenXArgs.push_back(llvm::ConstantInt::get(Ty, Rule.I.ArgConst));
+      break;
+    }
+    default:
+      llvm_unreachable_internal("unknown argument rule kind");
+    }
+  }
+}
+
+// Demangles and translates the given ESIMD intrinsic call instruction.
+// Example:
+//
+// ### Source-level intrinsic:
+//
+// sycl::intel::gpu::__vector_type<int, 16>::type
+// __esimd_flat_read<int, 16>(
+//     sycl::intel::gpu::__vector_type<unsigned long long, 16>::type,
+//     sycl::intel::gpu::__vector_type<int, 16>::type)
+//
+// ### Itanium-mangled name:
+//
+// _Z14__esimd_flat_readIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeENS2_IyXT0_EE4typeES5_
+//
+// ### Itanium demangler IR:
+//
+// FunctionEncoding(
+//     NestedName(
+//         NameWithTemplateArgs(
+//             NestedName(
+//                 NestedName(
+//                     NameType("cm"),
+//                     NameType("gen")),
+//                 NameType("__vector_type")),
+//             TemplateArgs(
+//                 {NameType("int"),
+//                  IntegerLiteral("", "16")})),
+//         NameType("type")),
+//     NameWithTemplateArgs(
+//         NameType("__esimd_flat_read"),
+//         TemplateArgs(
+//             {NameType("int"),
+//              IntegerLiteral("", "16")})),
+//     {NestedName(
+//          NameWithTemplateArgs(
+//              NestedName(
+//                  NestedName(
+//                      NameType("cm"),
+//                      NameType("gen")),
+//                  NameType("__vector_type")),
+//              TemplateArgs(
+//                  {NameType("unsigned long long"),
+//                   IntegerLiteral("", "16")})),
+//          NameType("type")),
+//      NestedName(
+//          NameWithTemplateArgs(
+//              NestedName(
+//                  NestedName(
+//                      NameType("cm"),
+//                      NameType("gen")),
+//                  NameType("__vector_type")),
+//              TemplateArgs(
+//                  {NameType("int"),
+//                   IntegerLiteral("", "16")})),
+//          NameType("type"))},
+//     , QualNone, FunctionRefQual::FrefQualNone)
+//
+static void translateESIMDIntrinsicCall(CallInst &CI) {
+  using Demangler = id::ManglingParser<SimpleAllocator>;
+  Function *F = CI.getCalledFunction();
+  StringRef MnglName = F->getName();
+  const char *MnglNameCStr = MnglName.data();
+  Demangler Parser(MnglNameCStr, MnglNameCStr + std::strlen(MnglNameCStr));
+  id::Node *AST = Parser.parse();
+
+  if (!AST || !Parser.ForwardTemplateRefs.empty()) {
+    Twine Msg("failed to demangle ESIMD intrinsic: " + MnglName);
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
+  }
+  if (AST->getKind() != id::Node::KFunctionEncoding) {
+    Twine Msg("bad ESIMD intrinsic: " + MnglName);
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
+  }
+  auto *FE = static_cast<id::FunctionEncoding *>(AST);
+  id::StringView BaseNameV = FE->getName()->getBaseName();
+
+  auto PrefLen = StringRef(ESIMD_INTRIN_PREF1).size();
+  StringRef BaseName(BaseNameV.begin() + PrefLen, BaseNameV.size() - PrefLen);
+  const auto &Desc = getIntrinDesc(BaseName);
+  if (!Desc.isValid()) // TODO remove this once all intrinsics are supported
+    return;
+
+  auto *FTy = CI.getCalledFunction()->getFunctionType();
+  std::string Suffix = getESIMDIntrinSuffix(FE, FTy, Desc.SuffixRule);
+  auto ID = GenXIntrinsic::lookupGenXIntrinsicID(
+      GenXIntrinsic::getGenXIntrinsicPrefix() + Desc.GenXSpelling + Suffix);
+
+  SmallVector<Value *, 16> GenXArgs;
+  createESIMDIntrinsicArgs(Desc, GenXArgs, CI, FE);
+
+  SmallVector<Type *, 16> GenXOverloadedTypes;
+  if (GenXIntrinsic::isOverloadedRet(ID))
+    GenXOverloadedTypes.push_back(CI.getType());
+  for (unsigned i = 0; i < GenXArgs.size(); ++i)
+    if (GenXIntrinsic::isOverloadedArg(ID, i))
+      GenXOverloadedTypes.push_back(GenXArgs[i]->getType());
+
+  Function *NewFDecl = GenXIntrinsic::getGenXDeclaration(CI.getModule(), ID,
+                                                         GenXOverloadedTypes);
+
+  Instruction *NewCI = IntrinsicInst::Create(
+      NewFDecl, GenXArgs,
+      NewFDecl->getReturnType()->isVoidTy() ? "" : CI.getName() + ".esimd",
+      &CI);
+  NewCI = addCastInstIfNeeded(&CI, NewCI);
+  CI.replaceAllUsesWith(NewCI);
+  CI.eraseFromParent();
+}
+
+static std::string getMDString(MDNode *N, unsigned I) {
+  if (!N)
+    return "";
+
+  Metadata *Op = N->getOperand(I);
+  if (!Op)
+    return "";
+
+  if (MDString *Str = dyn_cast<MDString>(Op)) {
+    return Str->getString().str();
+  }
+
+  return "";
+}
+
+void SYCLLowerESIMDLegacyPass::generateKernelMetadata(Module &M) {
+  if (M.getNamedMetadata(GENX_KERNEL_METADATA))
+    return;
+
+  auto Kernels = M.getOrInsertNamedMetadata(GENX_KERNEL_METADATA);
+  assert(Kernels->getNumOperands() == 0 && "metadata out of sync");
+
+  LLVMContext &Ctx = M.getContext();
+  Type *I32Ty = Type::getInt32Ty(Ctx);
+
+  std::string TargetTriple = M.getTargetTriple();
+  llvm::Triple T(TargetTriple);
+  T.setArchName("genx64");
+  TargetTriple = T.str();
+  M.setTargetTriple(TargetTriple);
+
+  enum { AK_NORMAL, AK_SAMPLER, AK_SURFACE, AK_VME };
+  enum { IK_NORMAL, IK_INPUT, IK_OUTPUT, IK_INPUT_OUTPUT };
+
+  for (auto &F : M.functions()) {
+    // Skip non-SIMD kernels.
+    if (F.getCallingConv() != CallingConv::SPIR_KERNEL ||
+        F.getMetadata("sycl_explicit_simd") == nullptr)
+      continue;
+
+    // Metadata node containing N i32s, where N is the number of kernel
+    // arguments, and each i32 is the kind of argument, one of:
+    // 0 = general, 1 = sampler, 2 = surface, 3 = vme
+    // (the same values as in the "kind" field of an "input_info" record in a
+    // vISA kernel).
+    SmallVector<Metadata *, 8> ArgKinds;
+
+    // Optional, not supported for compute
+    SmallVector<Metadata *, 8> ArgInOutKinds;
+
+    // Metadata node describing N strings, where N is the number of kernel
+    // arguments, each string describing the argument type in OpenCL.
+    // Required for running on top of the OpenCL runtime.
+    SmallVector<Metadata *, 8> ArgTypeDescs;
+
+    auto *KernelArgTypes = F.getMetadata("kernel_arg_type");
+    unsigned Idx = 0;
+
+    // Iterate the argument list to gather argument kinds and generate
+    // argument descriptors.
+    for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
+      Argument &Arg = *AI;
+
+      int Kind = AK_NORMAL;
+      int IKind = IK_NORMAL;
+
+      auto ArgType = getMDString(KernelArgTypes, Idx);
+
+      if (ArgType.find("image1d_t") != std::string::npos ||
+          ArgType.find("image2d_t") != std::string::npos ||
+          ArgType.find("image3d_t") != std::string::npos ||
+          ArgType.find("image1d_buffer_t") != std::string::npos) {
+        Kind = AK_SURFACE;
+        ArgTypeDescs.push_back(MDString::get(Ctx, ArgType));
+      } else {
+        StringRef ArgDesc = "";
+        if (Arg.getType()->isPointerTy())
+          ArgDesc = "svmptr_t";
+        ArgTypeDescs.push_back(MDString::get(Ctx, ArgDesc));
+      }
+
+      ArgKinds.push_back(getMD(ConstantInt::get(I32Ty, Kind)));
+      ArgInOutKinds.push_back(getMD(ConstantInt::get(I32Ty, IKind)));
+
+      Idx++;
+    }
+
+    MDNode *Kinds = MDNode::get(Ctx, ArgKinds);
+    MDNode *IOKinds = MDNode::get(Ctx, ArgInOutKinds);
+    MDNode *ArgDescs = MDNode::get(Ctx, ArgTypeDescs);
+
+    Metadata *MDArgs[] = {
+        getMD(&F),
+        MDString::get(Ctx, F.getName().str()),
+        Kinds,
+        getMD(llvm::ConstantInt::getNullValue(I32Ty)), // SLM size in bytes
+        getMD(llvm::ConstantInt::getNullValue(I32Ty)), // arg offsets
+        IOKinds,
+        ArgDescs};
+
+    // Add this kernel to the root.
+    Kernels->addOperand(MDNode::get(Ctx, MDArgs));
+    F.addFnAttr("oclrt", "1");
+    // F.setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+    F.addFnAttr("CMGenxMain");
+  }
+}
+
+// Collect all the vector types that are used by genx-volatile globals.
+void SYCLLowerESIMDLegacyPass::collectGenXVolatileType(Module &M) {
+  for (auto &G : M.getGlobalList()) {
+    if (!G.hasAttribute("genx_volatile"))
+      continue;
+    auto PTy = dyn_cast<PointerType>(G.getType());
+    if (!PTy)
+      continue;
+    auto GTy = dyn_cast<StructType>(PTy->getPointerElementType());
+    if (!GTy || !GTy->getName().endswith("cl::sycl::intel::gpu::simd"))
+      continue;
+    assert(GTy->getNumContainedTypes() == 1);
+    auto VTy = GTy->getContainedType(0);
+    assert(VTy->isVectorTy());
+    GenXVolatileTypeSet.insert(VTy);
+  }
+}
+
+} // namespace
+
+PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
+                                          FunctionAnalysisManager &FAM,
+                                          SmallPtrSet<Type *, 4> &GVTS) {
+  // Only consider functions marked with !sycl_explicit_simd
+  if (F.getMetadata("sycl_explicit_simd") == nullptr)
+    return PreservedAnalyses::all();
+
+  SmallVector<CallInst *, 32> ESIMDIntrCalls;
+  SmallVector<Instruction *, 8> ESIMDToErases;
+
+  for (Instruction &I : instructions(F)) {
+    if (auto CastOp = dyn_cast<llvm::CastInst>(&I)) {
+      llvm::Type *DstTy = CastOp->getDestTy();
+      auto CastOpcode = CastOp->getOpcode();
+      if ((CastOpcode == llvm::Instruction::FPToUI &&
+           DstTy->getScalarType()->getPrimitiveSizeInBits() <= 32) ||
+          (CastOpcode == llvm::Instruction::FPToSI &&
+           DstTy->getScalarType()->getPrimitiveSizeInBits() < 32)) {
+        IRBuilder<> Builder(&I);
+        llvm::Value *Src = CastOp->getOperand(0);
+        auto TmpTy =
+            llvm::VectorType::get(llvm::Type::getInt32Ty(DstTy->getContext()),
+                                  cast<VectorType>(DstTy)->getNumElements());
+        Src = Builder.CreateFPToSI(Src, TmpTy);
+
+        llvm::Instruction::CastOps TruncOp = llvm::Instruction::Trunc;
+        llvm::Value *NewDst = Builder.CreateCast(TruncOp, Src, DstTy);
+        CastOp->replaceAllUsesWith(NewDst);
+        ESIMDToErases.push_back(CastOp);
+      }
+    }
+
+    auto *CI = dyn_cast<CallInst>(&I);
+    Function *Callee = nullptr;
+    if (!CI || CI->isIndirectCall() || !(Callee = CI->getCalledFunction()))
+      continue;
+    StringRef Name = Callee->getName();
+
+    // See if the Name represents an ESIMD intrinsic and demangle only if it
+    // does.
+    if (!Name.startswith(ESIMD_INTRIN_PREF0))
+      continue;
+    // Now skip the digits.
+    StringRef Name1 = Name.substr(std::strlen(ESIMD_INTRIN_PREF0));
+    Name1 = Name1.drop_while([](char C) { return std::isdigit(C); });
+
+    // Process ESIMD builtins that go through special handling instead of
+    // the translation procedure.
+    if (Name1.startswith("cl4sycl5intel3gpu8slm_init")) {
+      // Tag the kernel with SLMSize metadata, and remove this builtin.
+      translateSLMInit(*CI);
+      ESIMDToErases.push_back(CI);
+      continue;
+    }
+    if (Name1.startswith("__esimd_pack_mask")) {
+      translatePackMask(*CI);
+      ESIMDToErases.push_back(CI);
+      continue;
+    }
+    if (Name1.startswith("__esimd_unpack_mask")) {
+      translateUnPackMask(*CI);
+      ESIMDToErases.push_back(CI);
+      continue;
+    }
+    // If vload/vstore is not about the vector types used by the globals
+    // marked as genx_volatile, we can translate them directly into generic
+    // load/store instructions; that way LLVM can optimize them as early as
+    // possible.
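+    // For example (cf. FUNC_23 and FUNC_28 in the test added by this patch;
+    // %v and %p are illustrative names), a call
+    //   %v = call <16 x i32> @_Z13__esimd_vloadIiLi16EE...(<16 x i32> addrspace(4)* %p)
+    // on a non-volatile type simply becomes
+    //   %v = load <16 x i32>, <16 x i32> addrspace(4)* %p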
+    if (Name1.startswith("__esimd_vload")) {
+      if (translateVLoad(*CI, GVTS)) {
+        ESIMDToErases.push_back(CI);
+        continue;
+      }
+    }
+    if (Name1.startswith("__esimd_vstore")) {
+      if (translateVStore(*CI, GVTS)) {
+        ESIMDToErases.push_back(CI);
+        continue;
+      }
+    }
+
+    if (Name1.startswith("__esimd_get_value")) {
+      translateGetValue(*CI);
+      ESIMDToErases.push_back(CI);
+      continue;
+    }
+
+    if (Name1.startswith(SPIRV_INTRIN_PREF)) {
+      auto SpirvPrefLen = StringRef(SPIRV_INTRIN_PREF).size();
+      StringRef SpirvIntrName = Name1.substr(SpirvPrefLen);
+      translateSpirvIntrinsic(CI, SpirvIntrName, ESIMDToErases);
+      // For now: if no match, just let it go untranslated.
+      continue;
+    }
+
+    if ((Name1.size() == 0) || !Name1.startswith(ESIMD_INTRIN_PREF1))
+      continue;
+    // This is an ESIMD intrinsic - record it for later translation.
+    ESIMDIntrCalls.push_back(CI);
+  }
+  // Now demangle and translate the found ESIMD intrinsic calls.
+  for (auto *CI : ESIMDIntrCalls) {
+    translateESIMDIntrinsicCall(*CI);
+  }
+  for (auto *CI : ESIMDToErases) {
+    CI->eraseFromParent();
+  }
+
+  // TODO FIXME ESIMD figure out less conservative result
+  return ESIMDIntrCalls.size() > 0 ? PreservedAnalyses::none()
+                                   : PreservedAnalyses::all();
+}
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
new file mode 100644
index 0000000000000..e257a64df1699
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -0,0 +1,198 @@
+; This test checks C++ ESIMD intrinsics lowering to the "@llvm.genx.*" form
+; consumable by the CM back-end.
+;
+; RUN: opt < %s -LowerESIMD -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-windows-sycldevice"
+
+%opencl.image2d_ro_t = type opaque
+%opencl.image2d_wo_t = type opaque
+
+%"cm::gen::simd" = type { <16 x i32> }
+
+@vg = dso_local global %"cm::gen::simd" zeroinitializer, align 64 #0
+@vc = dso_local addrspace(1) global <32 x i32> zeroinitializer
+
+define dso_local spir_func <32 x i32> @FUNC_1() !sycl_explicit_simd !1 {
+  %a_1 = alloca <32 x i64>
+  %1 = load <32 x i64>, <32 x i64>* %a_1
+  %a_2 = alloca <32 x i16>
+  %2 = load <32 x i16>, <32 x i16>* %a_2
+  %ret_val = call spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %1, <32 x i16> %2)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+  ret <32 x i32> %ret_val
+}
+
+define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 {
+  %a_1 = alloca <32 x i64>
+  %1 = load <32 x i64>, <32 x i64>* %a_1
+  %a_2 = alloca <32 x i32>
+  %2 = load <32 x i32>, <32 x i32>* %a_2
+  %a_3 = alloca <32 x i16>
+  %3 = load <32 x i16>, <32 x i16>* %a_3
+  %ret_val = call spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %1, <32 x i32> %2, <32 x i16> %3)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+  ret <32 x i32> %ret_val
+}
+
+define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 {
+  %a_1 = alloca <32 x i64>
+  %1 = load <32 x i64>, <32 x i64>* %a_1
+  %a_2 = alloca <32 x i32>
+  %2 = load <32 x i32>, <32 x i32>* %a_2
+  %a_3 = alloca <32 x i32>
+  %3 = load <32 x i32>, <32 x i32>* %a_3
+  %a_4 = alloca <32 x i16>
+  %4 = load <32 x i16>, <32 x i16>* %a_4
+  %ret_val = call spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %1, <32 x i32> %2, <32 x i32> %3, <32 x i16> %4)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+  ret <32 x i32> %ret_val
+}
+
+define dso_local spir_func <32 x i32> @FUNC_4() !sycl_explicit_simd !1 {
+  %ret_val = call spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 0)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32(i64 0)
+  ret <32 x i32> %ret_val
+}
+
+define dso_local spir_func void @FUNC_5() !sycl_explicit_simd !1 {
+  %a_1 = alloca <32 x i32>
+  %1 = load <32 x i32>, <32 x i32>* %a_1
+  call spir_func void @_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 0, <32 x i32> %1)
+; CHECK: call void @llvm.genx.svm.block.st.v32i32(i64 0, <32 x i32> %{{[0-9a-zA-Z_.]+}})
+  ret void
+}
+
+define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 {
+  %a_1 = alloca <32 x i64>
+  %1 = load <32 x i64>, <32 x i64>* %a_1
+  %a_2 = alloca <32 x i16>
+  %2 = load <32 x i16>, <32 x i16>* %a_2
+  %ret_val = call spir_func <32 x i32> @_Z17__esimd_flat_readIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeENS3_IyXT0_EE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %1, i32 0, <32 x i16> %2)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+  ret <32 x i32> %ret_val
+}
+
+define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 {
+  %a_1 = alloca <32 x i64>
+  %1 = load <32 x i64>, <32 x i64>* %a_1
+  %a_2 = alloca <32 x i32>
+  %2 = load <32 x i32>, <32 x i32>* %a_2
+  %a_3 = alloca <32 x i16>
+  %3 = load <32 x i16>, <32 x i16>* %a_3
+  call spir_func void @_Z18__esimd_flat_writeIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EEvNS1_13__vector_typeIyXT0_EE4typeENS3_IT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %1, <32 x i32> %2, i32 0, <32 x i16> %3)
+; CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
+  ret void
+}
+
+define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 {
+  %a_1 = alloca <16 x i16>
+  %1 = load <16 x i16>, <16 x i16>* %a_1
+  %a_2 = alloca <16 x i16>
+  %2 = load <16 x i16>, <16 x i16>* %a_2
+  %ret_val = call spir_func <16 x i16> @_Z12__esimd_sminIsLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeES5_S5_(<16 x i16> %1, <16 x i16> %2)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i16> @llvm.genx.smin.v16i16.v16i16(<16 x i16> %{{[0-9a-zA-Z_.]+}}, <16 x i16> %{{[0-9a-zA-Z_.]+}})
+  ret <16 x i16> %ret_val
+}
+
+define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 {
+  %a_1 = alloca <1 x float>
+  %1 = load <1 x float>, <1 x float>* %a_1
+  %a_2 = alloca <1 x float>
+  %2 = load <1 x float>, <1 x float>* %a_2
+  %ret_val = call spir_func <1 x float> @_Z16__esimd_div_ieeeILi1EEN2cm3gen13__vector_typeIfXT_EE4typeES4_S4_(<1 x float> %1, <1 x float> %2)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}})
+  ret <1 x float> %ret_val
+}
+
+define dso_local spir_func <8 x float> @FUNC_10() !sycl_explicit_simd !1 {
+  %a_1 = alloca <16 x float>
+  %1 = load <16 x float>, <16 x float>* %a_1
+  %ret_val = call spir_func <8 x float> @_Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT1_EE4typeENS2_IS3_XT0_EE4typeEt(<16 x float> %1, i16 zeroext 0)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x float> @llvm.genx.rdregionf.v8f32.v16f32.i16(<16 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0)
+  ret <8 x float> %ret_val
+}
+
+define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 {
+  %a_1 = alloca <16 x float>
+  %1 = load <16 x float>, <16 x float>* %a_1
+  %a_2 = alloca <8 x float>
+  %2 = load <8 x float>, <8 x float>* %a_2
+  %ret_val = call spir_func <16 x float> @_Z16__esimd_wrregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT0_EE4typeES5_NS2_IS3_XT1_EE4typeEtNS2_ItXT1_EE4typeE(<16 x float> %1, <8 x float> %2, i16 zeroext 0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x float> @llvm.genx.wrregionf.v16f32.v8f32.i16.v8i1(<16 x float> %{{[0-9a-zA-Z_.]+}}, <8 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <16 x float> %ret_val
+}
+
+define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 {
+  %ret_val = call spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 0, %opencl.image2d_ro_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}})
+  ret <32 x i32> %ret_val
+}
+
+define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 {
+  %a_3 = alloca <32 x i32>
+  %4 = load <32 x i32>, <32 x i32>* %a_3
+  call spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 0, %opencl.image2d_wo_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2, <32 x i32> %4)
+; CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
+  ret void
+}
+
+define dso_local spir_func <16 x i32> @FUNC_23() !sycl_explicit_simd !1 {
+  %ret_val = call spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*))
+; CHECK: %ret_val1 = load <16 x i32>, <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*), align 64
+; TODO: testcase to generate this:
+; CxHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* {{.*}})
+  ret <16 x i32> %ret_val
+}
+
+define dso_local spir_func void @FUNC_28(<32 x i32> %0) !sycl_explicit_simd !1 {
!sycl_explicit_simd !1 { + call spir_func void @_Z14__esimd_vstoreIiLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), <32 x i32> %0) +; CHECK: store <32 x i32> %0, <32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), align 128 + + ret void +} + +define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 { + %a_1 = alloca <32 x i32> + %1 = addrspacecast <32 x i32>* %a_1 to <32 x i32> addrspace(4)* + %a_2 = alloca <32 x i32> + %2 = load <32 x i32>, <32 x i32>* %a_2 + call spir_func void @_Z14__esimd_vstoreIiLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* %1, <32 x i32> %2) +; CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}} + ret void +} + +declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1) +declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2) +declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3) +declare dso_local spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 %0) +declare dso_local spir_func void @_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 %0, <32 x i32> %1) +declare dso_local spir_func <32 x i32> @_Z17__esimd_flat_readIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeENS3_IyXT0_EE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %0, i32 %1, <32 x i16> %2) +declare dso_local spir_func void @_Z18__esimd_flat_writeIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EEvNS1_13__vector_typeIyXT0_EE4typeENS3_IT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %0, <32 x i32> %1, i32 %2, <32 x i16> %3) +declare dso_local spir_func <16 x i16> @_Z12__esimd_sminIsLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeES5_S5_(<16 x i16> %0, <16 x i16> %1) +declare dso_local spir_func <1 x float> @_Z16__esimd_div_ieeeILi1EEN2cm3gen13__vector_typeIfXT_EE4typeES4_S4_(<1 x float> %0, <1 x float> %1) +declare dso_local spir_func <8 x float> @_Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT1_EE4typeENS2_IS3_XT0_EE4typeEt(<16 x float> %0, i16 zeroext %1) +declare dso_local spir_func <16 x float> @_Z16__esimd_wrregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT0_EE4typeES5_NS2_IS3_XT1_EE4typeEtNS2_ItXT1_EE4typeE(<16 x float> %0, <8 x float> %1, i16 zeroext %2, <8 x i16> %3) +declare dso_local spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* %0) +declare dso_local spir_func void @_Z14__esimd_vstoreIiLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* %0, <32 x i32> %1) +declare dso_local spir_func void @_Z14__esimd_vstoreIyLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i64> addrspace(4)* %0, <32 x i64> 
%1) +declare dso_local spir_func <32 x i64> @_Z13__esimd_vloadIyLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i64> addrspace(4)* %0) +declare dso_local spir_func <32 x i16> @_Z13__esimd_vloadItLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i16> addrspace(4)* %0) +declare dso_local spir_func void @_Z14__esimd_vstoreIjLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* %0, <32 x i32> %1) +declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIjLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0) +declare dso_local spir_func <16 x i16> @_Z13__esimd_vloadIsLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i16> addrspace(4)* %0) +declare dso_local spir_func void @_Z14__esimd_vstoreIsLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x i16> addrspace(4)* %0, <16 x i16> %1) +declare dso_local spir_func <1 x float> @_Z13__esimd_vloadIfLi1EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<1 x float> addrspace(4)* %0) +declare dso_local spir_func void @_Z14__esimd_vstoreIfLi1EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<1 x float> addrspace(4)* %0, <1 x float> %1) +declare dso_local spir_func <16 x float> @_Z13__esimd_vloadIfLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x float> addrspace(4)* %0) +declare dso_local spir_func void @_Z14__esimd_vstoreIfLi8EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<8 x float> addrspace(4)* %0, <8 x float> %1) +declare dso_local spir_func <8 x float> @_Z13__esimd_vloadIfLi8EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<8 x float> addrspace(4)* %0) +declare dso_local spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 %0, %opencl.image2d_ro_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5) +declare dso_local spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 %0, %opencl.image2d_wo_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5, <32 x i32> %6) +declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIiLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0) +declare dso_local spir_func void @_Z14__esimd_vstoreIfLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x float> addrspace(4)* %0, <16 x float> %1) + +attributes #0 = { "genx_byte_offset"="192" "genx_volatile" } + +!1 = !{} diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index a1bd3a68facad..e9b744e13a795 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -590,6 +590,7 @@ int main(int argc, char **argv) { initializeHardwareLoopsPass(Registry); initializeTypePromotionPass(Registry); initializeSYCLLowerWGScopeLegacyPassPass(Registry); + initializeSYCLLowerESIMDLegacyPassPass(Registry); #ifdef BUILD_EXAMPLES initializeExampleIRTransforms(Registry); From ba89a499f40f87ad3bbe9cd7ae6ffa0c2b906cff Mon Sep 17 00:00:00 2001 From: Konstantin S Bobrovsky Date: Wed, 17 Jun 2020 15:53:27 -0700 Subject: [PATCH 3/7] [SQUASH] Address review comments. 
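
Review fixes squashed here:
- correct the banner comment in LowerESIMD.h, which still named the file
  LowerESIMD.cpp;
- walk kernel arguments with a range-based for loop instead of explicit
  iterators;
- drop commented-out dead code from generateKernelMetadata();
- make the lit test target triple OS-agnostic.

For reference, the argument walk now has the usual LLVM shape (sketch only,
with the classification body elided; see the hunk below for the real code):

    for (const Argument &Arg : F.args()) {
      // compute the Kind/IKind descriptors for Arg
    }
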
Signed-off-by: Konstantin S Bobrovsky
---
 llvm/include/llvm/SYCLLowerIR/LowerESIMD.h   | 2 +-
 llvm/lib/SYCLLowerIR/LowerESIMD.cpp          | 5 +----
 llvm/test/SYCLLowerIR/esimd_lower_intrins.ll | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h b/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h
index 2141b65c5895e..221241616fb07 100644
--- a/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h
+++ b/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h
@@ -1,4 +1,4 @@
-//===-- LowerESIMD.cpp - lower Explicit SIMD (ESIMD) constructs -----------===//
+//===---- LowerESIMD.h - lower Explicit SIMD (ESIMD) constructs -----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index c9e760359a83f..7e4ac4c28141d 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -1134,9 +1134,7 @@ void SYCLLowerESIMDLegacyPass::generateKernelMetadata(Module &M) {
 
     // Iterate argument list to gather argument kinds and generate argument
     // descriptors.
-    for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
-      Argument &Arg = *AI;
-
+    for (const Argument &Arg : F.args()) {
       int Kind = AK_NORMAL;
       int IKind = IK_NORMAL;
 
@@ -1177,7 +1175,6 @@ void SYCLLowerESIMDLegacyPass::generateKernelMetadata(Module &M) {
     // Add this kernel to the root.
     Kernels->addOperand(MDNode::get(Ctx, MDArgs));
     F.addFnAttr("oclrt", "1");
-    // F.setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
     F.addFnAttr("CMGenxMain");
   }
 }
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
index e257a64df1699..ee8f030c7097f 100644
--- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -4,7 +4,7 @@
 ; RUN: opt < %s -LowerESIMD -S | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
-target triple = "spir64-unknown-windows-sycldevice"
+target triple = "spir64-unknown-unknown-sycldevice"
 
 %opencl.image2d_ro_t = type opaque
 %opencl.image2d_wo_t = type opaque

From 32584d865699edb77431134fd280b8f6a8a75a5b Mon Sep 17 00:00:00 2001
From: kbobrovs
Date: Fri, 19 Jun 2020 15:40:06 -0700
Subject: [PATCH 4/7] Apply suggestions from code review

Co-authored-by: Alexey Sachkov
---
 llvm/lib/SYCLLowerIR/LowerESIMD.cpp | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index 7e4ac4c28141d..ffcc829ea46d2 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -440,7 +440,6 @@ Type *parsePrimitiveTypeString(StringRef TyStr, LLVMContext &Ctx) {
       .Case("float", IntegerType::getFloatTy(Ctx))
       .Case("double", IntegerType::getDoubleTy(Ctx))
       .Case("void", IntegerType::getVoidTy(Ctx))
-      .Case("", nullptr)
       .Default(nullptr);
 }
 
@@ -825,9 +824,9 @@ translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
                         SmallVector &ESIMDToErases) {
   auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
                              CI](StringRef SpvIName, auto TranslateFunc) {
-    if (SpirvIntrName.startswith(SpvIName)) {
+    if (SpirvIntrName.consume_front(SpvIName)) {
       Value *TranslatedV =
-          TranslateFunc(*CI, SpirvIntrName.substr(SpvIName.size() + 1, 1));
+          TranslateFunc(*CI, SpirvIntrName.front()));
       CI->replaceAllUsesWith(TranslatedV);
       ESIMDToErases.push_back(CI);
     }
@@ -1027,7 +1026,7 @@ static void translateESIMDIntrinsicCall(CallInst &CI) {
   Function *F = CI.getCalledFunction();
   StringRef MnglName = F->getName();
-  const char *MnglNameCStr = MnglName.data();
-  Demangler Parser(MnglNameCStr, MnglNameCStr + std::strlen(MnglNameCStr));
+  Demangler Parser(MnglName.begin(), MnglName.end()));
 
   id::Node *AST = Parser.parse();
   if (!AST || !Parser.ForwardTemplateRefs.empty()) {
@@ -1233,17 +1232,16 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
       auto *CI = dyn_cast(&I);
       Function *Callee = nullptr;
-      if (!CI || CI->isIndirectCall() || !(Callee = CI->getCalledFunction()))
+      if (!CI || !(Callee = CI->getCalledFunction()))
         continue;
       StringRef Name = Callee->getName();
 
       // See if the Name represents an ESIMD intrinsic and demangle only if it
      // does.
-      if (!Name.startswith(ESIMD_INTRIN_PREF0))
+      if (!Name.consume_front(ESIMD_INTRIN_PREF0))
         continue;
       // now skip the digits
-      StringRef Name1 = Name.substr(std::strlen(ESIMD_INTRIN_PREF0));
-      Name1 = Name1.drop_while([](char C) { return std::isdigit(C); });
+      StringRef Name1 = Name1.drop_while([](char C) { return std::isdigit(C); });
 
       // process ESIMD builtins that go through special handling instead of
       // the translation procedure
@@ -1286,15 +1284,13 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
         continue;
       }
 
-      if (Name1.startswith(SPIRV_INTRIN_PREF)) {
-        auto SpirvPrefLen = StringRef(SPIRV_INTRIN_PREF).size();
-        StringRef SpirvIntrName = Name1.substr(SpirvPrefLen);
-        translateSpirvIntrinsic(CI, SpirvIntrName, ESIMDToErases);
+      if (Name1.consume_front(SPIRV_INTRIN_PREF)) {
+        translateSpirvIntrinsic(CI, Name1, ESIMDToErases);
         // For now: if no match, just let it go untranslated.
         continue;
       }
 
-      if ((Name1.size() == 0) || !Name1.startswith(ESIMD_INTRIN_PREF1))
+      if (Name1.empty() || !Name1.startswith(ESIMD_INTRIN_PREF1))
         continue;
       // this is ESIMD intrinsic - record for later translation
       ESIMDIntrCalls.push_back(CI);

From f22c5cd6404f27263d1d5d9c940ed0ff6d665810 Mon Sep 17 00:00:00 2001
From: Konstantin S Bobrovsky
Date: Mon, 22 Jun 2020 22:27:20 -0700
Subject: [PATCH 5/7] [SQUASH] Fixes after implementing code review comments.
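
This squash:
- makes getIntrinDesc() fail with report_fatal_error on an unknown ESIMD
  intrinsic instead of printing a warning and returning an invalid
  descriptor;
- switches fixed-width vector creation to FixedVectorType::get;
- folds the intrinsic-name prefix handling into a single StringRef that is
  consumed in place via consume_front;
- adds a lit test (FUNC_30) checking that an slm_init call is erased and
  its argument lands in the SLMSize slot of the genx.kernels metadata.

The resulting name-dispatch shape is roughly the following (illustrative
sketch, not the literal patch contents; it assumes ESIMD_INTRIN_PREF0 is
the Itanium "_Z" prefix, which the digit-skipping step suggests):

    // inside the instruction-walk loop of the pass
    StringRef Name = Callee->getName();
    if (!Name.consume_front("_Z")) // ESIMD_INTRIN_PREF0
      continue;                    // not a mangled name - skip
    // drop the mangled-length digits: "_Z20__esimd_..." -> "__esimd_..."
    Name = Name.drop_while([](char C) { return std::isdigit(C); });
    if (Name.startswith("__esimd_pack_mask")) { /* special handling */ }
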
Signed-off-by: Konstantin S Bobrovsky
---
 llvm/lib/SYCLLowerIR/LowerESIMD.cpp          | 48 +++++++++-----------
 llvm/test/SYCLLowerIR/esimd_lower_intrins.ll | 15 +++++-
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index ffcc829ea46d2..5278baa226651 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 
 using namespace llvm;
 namespace id = itanium_demangle;
@@ -385,12 +386,7 @@ static const ESIMDIntrinDesc &getIntrinDesc(StringRef SrcSpelling) {
 
   if (It == Table.end()) {
     Twine Msg("unknown ESIMD intrinsic: " + SrcSpelling);
-
-    llvm::errs() << Msg << "\n";
-    // TODO warning message for now, to enable compiling tests with intrinsics
-    // that are not implemented yet
-    // llvm::report_fatal_error(Msg, false/*no crash diag*/);
-    return InvalidDesc;
+    llvm::report_fatal_error(Msg, false /*no crash diag*/);
   }
   return It->second;
 }
@@ -454,8 +450,8 @@ static const T *castNodeImpl(const id::Node *N, id::Node::Kind K) {
 
 static APInt parseTemplateArg(id::FunctionEncoding *FE, unsigned int N,
                               Type *&Ty, LLVMContext &Ctx) {
-  auto *Nm = castNode(FE->getName(), NameWithTemplateArgs);
-  auto *ArgsN = castNode(Nm->TemplateArgs, TemplateArgs);
+  const auto *Nm = castNode(FE->getName(), NameWithTemplateArgs);
+  const auto *ArgsN = castNode(Nm->TemplateArgs, TemplateArgs);
   id::NodeArray Args = ArgsN->getParams();
   assert(N < Args.size() && "too few template arguments");
   id::StringView Val;
@@ -652,7 +648,7 @@ static void translatePackMask(CallInst &CI) {
   IRBuilder<> Builder(&CI);
   llvm::Value *Trunc = Builder.CreateTrunc(
       CI.getArgOperand(0),
-      llvm::VectorType::get(llvm::Type::getInt1Ty(Context), N));
+      llvm::FixedVectorType::get(llvm::Type::getInt1Ty(Context), N));
   llvm::Type *Ty = llvm::Type::getIntNTy(Context, N);
 
   llvm::Value *BitCast = Builder.CreateBitCast(Trunc, Ty);
@@ -699,11 +695,11 @@ static void translateUnPackMask(CallInst &CI) {
   }
   assert(Arg0->getType()->getPrimitiveSizeInBits() == N);
   Arg0 = Builder.CreateBitCast(
-      Arg0, llvm::VectorType::get(llvm::Type::getInt1Ty(Context), N));
+      Arg0, llvm::FixedVectorType::get(llvm::Type::getInt1Ty(Context), N));
 
   // get N x i16
   llvm::Value *TransCI = Builder.CreateZExt(
-      Arg0, llvm::VectorType::get(llvm::Type::getInt16Ty(Context), N));
+      Arg0, llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Context), N));
   TransCI->takeName(&CI);
   cast(TransCI)->setDebugLoc(CI.getDebugLoc());
   CI.replaceAllUsesWith(TransCI);
@@ -782,7 +778,7 @@ static Instruction *generateVectorGenXForSpirv(CallInst &CI, StringRef Suff,
   LLVMContext &Ctx = CI.getModule()->getContext();
   Type *I32Ty = Type::getInt32Ty(Ctx);
   Function *NewFDecl = GenXIntrinsic::getGenXDeclaration(
-      CI.getModule(), ID, {VectorType::get(I32Ty, 3)});
+      CI.getModule(), ID, {FixedVectorType::get(I32Ty, 3)});
   Instruction *IntrI =
       IntrinsicInst::Create(NewFDecl, {}, CI.getName() + ".esimd", &CI);
   int ExtractIndex = getIndexForSuffix(Suff);
@@ -825,8 +821,7 @@ translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
   auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
                              CI](StringRef SpvIName, auto TranslateFunc) {
     if (SpirvIntrName.consume_front(SpvIName)) {
-      Value *TranslatedV =
-          TranslateFunc(*CI, SpirvIntrName.front()));
+      Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName);
       CI->replaceAllUsesWith(TranslatedV);
       ESIMDToErases.push_back(CI);
     }
@@ -1025,8 +1020,7 @@ static void translateESIMDIntrinsicCall(CallInst &CI) {
   using Demangler = id::ManglingParser;
   Function *F = CI.getCalledFunction();
   StringRef MnglName = F->getName();
-  const char *MnglNameCStr = MnglName.data();
-  Demangler Parser(MnglName.begin(), MnglName.end()));
+  Demangler Parser(MnglName.begin(), MnglName.end());
 
   id::Node *AST = Parser.parse();
   if (!AST || !Parser.ForwardTemplateRefs.empty()) {
@@ -1219,7 +1213,7 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
       IRBuilder<> Builder(&I);
       llvm::Value *Src = CastOp->getOperand(0);
       auto TmpTy =
-          llvm::VectorType::get(llvm::Type::getInt32Ty(DstTy->getContext()),
+          llvm::FixedVectorType::get(llvm::Type::getInt32Ty(DstTy->getContext()),
                                 cast(DstTy)->getNumElements());
       Src = Builder.CreateFPToSI(Src, TmpTy);
 
@@ -1241,22 +1235,22 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
       if (!Name.consume_front(ESIMD_INTRIN_PREF0))
         continue;
       // now skip the digits
-      StringRef Name1 = Name1.drop_while([](char C) { return std::isdigit(C); });
+      Name = Name.drop_while([](char C) { return std::isdigit(C); });
 
       // process ESIMD builtins that go through special handling instead of
       // the translation procedure
-      if (Name1.startswith("cl4sycl5intel3gpu8slm_init")) {
+      if (Name.startswith("N2cl4sycl5intel3gpu8slm_init")) {
         // tag the kernel with meta-data SLMSize, and remove this builtin
         translateSLMInit(*CI);
         ESIMDToErases.push_back(CI);
         continue;
       }
-      if (Name1.startswith("__esimd_pack_mask")) {
+      if (Name.startswith("__esimd_pack_mask")) {
         translatePackMask(*CI);
         ESIMDToErases.push_back(CI);
         continue;
       }
-      if (Name1.startswith("__esimd_unpack_mask")) {
+      if (Name.startswith("__esimd_unpack_mask")) {
         translateUnPackMask(*CI);
         ESIMDToErases.push_back(CI);
         continue;
@@ -1265,32 +1259,32 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
       // those globals marked as genx_volatile, We can translate
       // them directly into generic load/store inst. In this way
       // those insts can be optimized by llvm ASAP.
-      if (Name1.startswith("__esimd_vload")) {
+      if (Name.startswith("__esimd_vload")) {
         if (translateVLoad(*CI, GVTS)) {
           ESIMDToErases.push_back(CI);
           continue;
         }
       }
-      if (Name1.startswith("__esimd_vstore")) {
+      if (Name.startswith("__esimd_vstore")) {
        if (translateVStore(*CI, GVTS)) {
           ESIMDToErases.push_back(CI);
           continue;
         }
       }
-      if (Name1.startswith("__esimd_get_value")) {
+      if (Name.startswith("__esimd_get_value")) {
         translateGetValue(*CI);
         ESIMDToErases.push_back(CI);
         continue;
       }
 
-      if (Name1.consume_front(SPIRV_INTRIN_PREF)) {
-        translateSpirvIntrinsic(CI, Name1, ESIMDToErases);
+      if (Name.consume_front(SPIRV_INTRIN_PREF)) {
+        translateSpirvIntrinsic(CI, Name, ESIMDToErases);
         // For now: if no match, just let it go untranslated.
         continue;
       }
 
-      if (Name1.empty() || !Name1.startswith(ESIMD_INTRIN_PREF1))
+      if (Name.empty() || !Name.startswith(ESIMD_INTRIN_PREF1))
         continue;
       // this is ESIMD intrinsic - record for later translation
       ESIMDIntrCalls.push_back(CI);
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
index ee8f030c7097f..f29503b1bfce3 100644
--- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -163,6 +163,13 @@ define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 {
   ret void
 }
 
+define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 {
+; CHECK: define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1
+  call spir_func void @_ZN2cl4sycl5intel3gpu8slm_initEj(i32 1023)
+  ret void
+; CHECK-NEXT: ret void
+}
+
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3)
@@ -192,7 +199,13 @@ declare dso_local spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14
 declare dso_local spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 %0, %opencl.image2d_wo_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5, <32 x i32> %6)
 declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIiLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0)
 declare dso_local spir_func void @_Z14__esimd_vstoreIfLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x float> addrspace(4)* %0, <16 x float> %1)
+declare dso_local spir_func void @_ZN2cl4sycl5intel3gpu8slm_initEj(i32)
 
 attributes #0 = { "genx_byte_offset"="192" "genx_volatile" }
 
-!1 = !{}
+!genx.kernels = !{!0}
+
+!0 = !{void ()* @"FUNC_30", !"FUNC_30", !1, i32 0, i32 0, !1, !2, i32 0, i32 0}
+; CHECK: !0 = !{void ()* @FUNC_30, !"FUNC_30", !1, i32 1023, i32 0, !1, !2, i32 0, i32 0}
+!1 = !{i32 0, i32 0}
+!2 = !{}

From 6c0a911403aae1b5664a9f90b71c8994ad997417 Mon Sep 17 00:00:00 2001
From: Konstantin S Bobrovsky
Date: Tue, 30 Jun 2020 22:42:40 -0700
Subject: [PATCH 6/7] [SYCL][ESIMD] Move vc-intrinsics dependency build to llvm/lib/SYCLLowerIR.

SYCLLowerIR is the only user of vc-intrinsics.

Signed-off-by: Konstantin S Bobrovsky
---
 llvm/lib/SYCLLowerIR/CMakeLists.txt | 30 +++++++++++++++++++++++++++++
 sycl/CMakeLists.txt                 | 29 ----------------------------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index 03b507bafc9ed..a0bff08a5524c 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -1,3 +1,33 @@
+# Lowering of SYCL ESIMD kernels depends on vc-intrinsics
+# NOTE: could have been added earlier from llvm/projects
+if (NOT TARGET LLVMGenXIntrinsics)
+  if (NOT DEFINED LLVMGenXIntrinsics_SOURCE_DIR)
+    message(STATUS "vc-intrinsics are missing. Will try to download them from github.com")
+
+    include(FetchContent)
+    FetchContent_Declare(vc-intrinsics
+      GIT_REPOSITORY https://github.com/intel/vc-intrinsics.git
+      GIT_TAG        cce6e48c28eb850d7dadd30841c0d95f009bbca1
+    )
+    FetchContent_MakeAvailable(vc-intrinsics)
+    FetchContent_GetProperties(vc-intrinsics)
+
+    set(LLVMGenXIntrinsics_SOURCE_DIR ${vc-intrinsics_SOURCE_DIR})
+    set(LLVMGenXIntrinsics_BINARY_DIR ${vc-intrinsics_BINARY_DIR})
+  else()
+    # -DLLVMGenXIntrinsics_SOURCE_DIR is provided
+    message(STATUS "vc-intrinsics are added manually ${LLVMGenXIntrinsics_SOURCE_DIR}")
+
+    set(LLVMGenXIntrinsics_BINARY_DIR ${CMAKE_BINARY_DIR}/vc-intrinsics-build)
+    add_subdirectory(${LLVMGenXIntrinsics_SOURCE_DIR} ${LLVMGenXIntrinsics_BINARY_DIR})
+  endif()
+
+  target_include_directories(LLVMGenXIntrinsics
+    PUBLIC $
+    PUBLIC $
+  )
+endif()
+
 add_llvm_component_library(LLVMSYCLLowerIR
   LowerWGScope.cpp
   LowerESIMD.cpp
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index c87834185879d..fb36919c77860 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -184,35 +184,6 @@ install(DIRECTORY ${OPENCL_INCLUDE}/CL
   COMPONENT opencl-headers
 )
 
-# vc-intrinsics could have been added earlier from llvm/projects
-if (NOT TARGET LLVMGenXIntrinsics)
-  if (NOT DEFINED LLVMGenXIntrinsics_SOURCE_DIR)
-    message(STATUS "vc-intrinsics are missing. Will try to download them from github.com")
-
-    include(FetchContent)
-    FetchContent_Declare(vc-intrinsics
-      GIT_REPOSITORY https://github.com/intel/vc-intrinsics.git
-      GIT_TAG        cce6e48c28eb850d7dadd30841c0d95f009bbca1
-    )
-    FetchContent_MakeAvailable(vc-intrinsics)
-    FetchContent_GetProperties(vc-intrinsics)
-
-    set(LLVMGenXIntrinsics_SOURCE_DIR ${vc-intrinsics_SOURCE_DIR})
-    set(LLVMGenXIntrinsics_BINARY_DIR ${vc-intrinsics_BINARY_DIR})
-  else()
-    # -DLLVMGenXIntrinsics_SOURCE_DIR is provided
-    message(STATUS "vc-intrinsics are added manually ${LLVMGenXIntrinsics_SOURCE_DIR}")
-
-    set(LLVMGenXIntrinsics_BINARY_DIR ${CMAKE_BINARY_DIR}/vc-intrinsics-build)
-    add_subdirectory(${LLVMGenXIntrinsics_SOURCE_DIR} ${LLVMGenXIntrinsics_BINARY_DIR})
-  endif()
-
-  target_include_directories(LLVMGenXIntrinsics
-    PUBLIC $
-    PUBLIC $
-  )
-endif()
-
 option(SYCL_BUILD_PI_CUDA
   "Enables the CUDA backend for the Plugin Interface" OFF)

From 24f099fd33b5a5b5a8c3dbee683ee8c8c2176a9e Mon Sep 17 00:00:00 2001
From: Konstantin S Bobrovsky
Date: Tue, 30 Jun 2020 23:03:58 -0700
Subject: [PATCH 7/7] [SQUASH] Applied clang-format.

Signed-off-by: Konstantin S Bobrovsky
---
 llvm/lib/SYCLLowerIR/LowerESIMD.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index 5278baa226651..232706c101d23 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -33,7 +33,6 @@
 #include
 #include
 #include
-#include
 
 using namespace llvm;
 namespace id = itanium_demangle;
@@ -1212,9 +1211,9 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
       DstTy->getScalarType()->getPrimitiveSizeInBits() < 32)) {
       IRBuilder<> Builder(&I);
       llvm::Value *Src = CastOp->getOperand(0);
-      auto TmpTy =
-          llvm::FixedVectorType::get(llvm::Type::getInt32Ty(DstTy->getContext()),
-                                cast(DstTy)->getNumElements());
+      auto TmpTy = llvm::FixedVectorType::get(
+          llvm::Type::getInt32Ty(DstTy->getContext()),
+          cast(DstTy)->getNumElements());
       Src = Builder.CreateFPToSI(Src, TmpTy);
 
       llvm::Instruction::CastOps TruncOp = llvm::Instruction::Trunc;
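
Note on testing: the lowering is covered by the lit test updated throughout
this series, and it can also be exercised directly through opt, mirroring
the test's own RUN line (the binary paths below assume a local build tree
and are illustrative):

    build/bin/opt -LowerESIMD -S < llvm/test/SYCLLowerIR/esimd_lower_intrins.ll \
      | build/bin/FileCheck llvm/test/SYCLLowerIR/esimd_lower_intrins.ll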