From 4e5ce57bb1c5f63e8ac18cf0012782a4fbeb9d8f Mon Sep 17 00:00:00 2001 From: Mohammad Fawaz Date: Thu, 25 Jun 2020 20:07:41 -0400 Subject: [PATCH 1/4] [SYCL][FPGA] Adding the new header fpga_lsu.hpp that contains wrappers around the Clang builtin __builtin_intel_fpga_mem --- sycl/doc/extensions/IntelFPGA/FPGALsu.md | 112 +++++++++++++++++ .../include/CL/sycl/intel/fpga_extensions.hpp | 1 + sycl/include/CL/sycl/intel/fpga_lsu.hpp | 116 ++++++++++++++++++ sycl/include/CL/sycl/intel/fpga_utils.hpp | 34 +++++ sycl/test/fpga_tests/fpga_lsu.cpp | 77 ++++++++++++ 5 files changed, 340 insertions(+) create mode 100644 sycl/doc/extensions/IntelFPGA/FPGALsu.md create mode 100644 sycl/include/CL/sycl/intel/fpga_lsu.hpp create mode 100644 sycl/include/CL/sycl/intel/fpga_utils.hpp create mode 100644 sycl/test/fpga_tests/fpga_lsu.cpp diff --git a/sycl/doc/extensions/IntelFPGA/FPGALsu.md b/sycl/doc/extensions/IntelFPGA/FPGALsu.md new file mode 100644 index 0000000000000..ed043caaa7b06 --- /dev/null +++ b/sycl/doc/extensions/IntelFPGA/FPGALsu.md @@ -0,0 +1,112 @@ + +# FPGA lsu + +The Intel FPGA `lsu` class is implemented in `CL/sycl/intel/fpga_lsu.hpp` which +is included in `CL/sycl/intel/fpga_extensions.hpp`. + +The class `cl::sycl::intel::lsu` allows users to explicitly request that the +implementation of a global memory access is configured in a certain way. The +class has two member functions, `load()` and `store()` which allow loading from +and storing to a `global_ptr`, respectively, and is templated on the following +4 optional paremeters: + +1. **`cl::sycl::intel::burst_coalesce`, where `N` is 0 or 1**: request, to +the extent possible, that a dynamic burst coalescer be implemented when `load` +or `store` are called. +2. **`cl::sycl::intel::cache`, where `N` is an integer greater or equal to +0**: request, to the extent possible, that a read-only cache of the specified +size be implemented when when `load` is called. It is not allowed to use that +parameter for `store`. +3. **`cl::sycl::intel::dont_statically_coalesce`, where `N` is 0 or 1**: +request, to the extent possible, that `load` or `store` accesses, should not be +statically coalesced with other memory accesses at compile time. +4. **`cl::sycl::intel::prefetch`, where `N` is 0 or 1**: request, to the +extent possible, that a prefetcher be implemented when `load` is called. It is +not allowed to use that parameter for `store`. + +Currently, not every combination of parameters is allowed due to limitations in +the backend. The following rules apply: +1. For `store`, `cl::sycl::intel::cache` and `cl::sycl::intel::prefetch` must +be `0`. +2. For `load`, if `cl::sycl::intel::cache` is set to a value greater than `0`, +then `cl::sycl::intel::burst_coalesce` must be set to `1`. +3. For `load`, exactly one of `cl::sycl::intel::prefetch` and +`cl::sycl::intel::burst_coalesce` is allowed to be `1`. +4. For `load`, exactly one of `cl::sycl::intel::prefetch` and +`cl::sycl::intel::cache` is allowed to be 1. + +## Implementation + +The implementation relies on the Clang built-in `__builtin_intel_fpga_mem` when +parsing the SYCL device code. The built-in uses the LLVM `ptr.annotation` +intrinsic under the hood to annotate the pointer that is being accessed. +```c++ +template class lsu final { +public: + lsu() = delete; + + template static T &load(sycl::global_ptr Ptr) { + check_load(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + return *__builtin_intel_fpga_mem((T *)Ptr, _burst_coalesce | _cache | + _dont_statically_coalesce | + _prefetch, _cache_size); +#else + return *Ptr; +#endif + } + + template static void store(sycl::global_ptr Ptr, T Val) { + check_store(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + *__builtin_intel_fpga_mem((T *)Ptr, _burst_coalesce | _cache | + _dont_statically_coalesce | + _prefetch, _cache_size) = Val; +#else + *Ptr = Val; +#endif + } + ... +} +``` + +## Usage + +```c++ +#include +... +cl::sycl::buffer output_buffer(output_data, 1); +cl::sycl::buffer input_buffer(input_data, 1); + +Queue.submit([&](cl::sycl::handler &cgh) { + auto output_accessor = output_buffer.get_access(cgh); + auto input_accessor = input_buffer.get_access(cgh); + + cgh.single_task([=]() { + auto input_ptr = input_accessor.get_pointer(); + auto output_ptr = output_accessor.get_pointer(); + + using PrefetchingLSU = + cl::sycl::intel::lsu, + cl::sycl::intel::dont_statically_coalesce<1>>; + + using BurstCoalescedLSU = + cl::sycl::intel::lsu, + cl::sycl::intel::dont_statically_coalesce<1>>; + + using CachingLSU = + cl::sycl::intel::lsu, + cl::sycl::intel::cache<1024>, + cl::sycl::intel::dont_statically_coalesce<0>>; + + using PipelinedLSU = cl::sycl::intel::lsu<>; + + int X = PrefetchingLSU::load(input_ptr); // int X = input_ptr[0] + int Y = CachingLSU::load(input_ptr + 1); // int Y = input_ptr[1] + + BurstCoalescedLSU::store(output_ptr, X); // output_ptr[0] = X + PipelinedLSU::store(output_ptr + 1, Y); // output_ptr[1] = Y + }); +}); +... +``` diff --git a/sycl/include/CL/sycl/intel/fpga_extensions.hpp b/sycl/include/CL/sycl/intel/fpga_extensions.hpp index a9fca1e6139d2..7140421fe5189 100644 --- a/sycl/include/CL/sycl/intel/fpga_extensions.hpp +++ b/sycl/include/CL/sycl/intel/fpga_extensions.hpp @@ -8,5 +8,6 @@ #pragma once #include +#include #include #include diff --git a/sycl/include/CL/sycl/intel/fpga_lsu.hpp b/sycl/include/CL/sycl/intel/fpga_lsu.hpp new file mode 100644 index 0000000000000..527d9e7b790b3 --- /dev/null +++ b/sycl/include/CL/sycl/intel/fpga_lsu.hpp @@ -0,0 +1,116 @@ +//==-------------- fpga_lsu.hpp --- SYCL FPGA Reg Extensions ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "fpga_utils.hpp" +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace intel { +constexpr unsigned BURST_COALESCE = 0x1; +constexpr unsigned CACHE = 0x2; +constexpr unsigned DONT_STATICALLY_COALESCE = 0x4; +constexpr unsigned PREFETCH = 0x8; + +template struct burst_coalesce { + static constexpr int value = N; + static constexpr int default_value = false; +}; + +template struct cache { + static constexpr int value = N; + static constexpr int default_value = 0; +}; + +template struct prefetch { + static constexpr int value = N; + static constexpr int default_value = false; +}; + +template struct dont_statically_coalesce { + static constexpr int value = N; + static constexpr int default_value = 0; +}; + +template class lsu final { +public: + lsu() = delete; + + template static T &load(sycl::global_ptr Ptr) { + check_load(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + return *__builtin_intel_fpga_mem((T *)Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val); +#else + return *Ptr; +#endif + } + + template static void store(sycl::global_ptr Ptr, T Val) { + check_store(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + *__builtin_intel_fpga_mem((T *)Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val) = Val; +#else + *Ptr = Val; +#endif + } + +private: + static constexpr int _burst_coalesce_val = + GetValue::value; + static constexpr unsigned _burst_coalesce = + _burst_coalesce_val == 1 ? BURST_COALESCE : 0; + + static constexpr int _cache_val = + GetValue::value; + static constexpr unsigned _cache = (_cache_val > 0) ? CACHE : 0; + + static constexpr unsigned _dont_statically_coalesce_val = + GetValue::value; + static constexpr unsigned _dont_statically_coalesce = + _dont_statically_coalesce_val == 1 ? DONT_STATICALLY_COALESCE : 0; + + static constexpr unsigned _prefetch_val = + GetValue::value; + static constexpr unsigned _prefetch = _prefetch_val ? PREFETCH : 0; + + static_assert(_burst_coalesce_val == 0 || _burst_coalesce_val == 1, + "burst_coalesce parameter must be 0 or 1"); + static_assert(_cache_val >= 0, "cache size parameter must be non-negative"); + static_assert(_dont_statically_coalesce_val == 0 || + _dont_statically_coalesce_val == 1, + "dont_statically_coalesce parameter must be 0 or 1"); + static_assert(_prefetch_val == 0 || _prefetch_val == 1, + "prefetch parameter must be 0 or 1"); + + static void check_load() { + static_assert(_cache == 0 || _burst_coalesce == BURST_COALESCE, + "unable to implement a cache without a burst coalescer"); + static_assert(_prefetch == 0 || _burst_coalesce == 0, + "unable to implement a prefetcher and a burst coalescer " + "simulataneously"); + static_assert( + _prefetch == 0 || _cache == 0, + "unable to implement a prefetcher and a cache simulataneously"); + } + static void check_store() { + static_assert(_cache == 0, "unable to implement a store LSU with a cache."); + static_assert(_prefetch == 0, + "unable to implement a store LSU with a prefetcher."); + } +}; +} // namespace intel +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_utils.hpp b/sycl/include/CL/sycl/intel/fpga_utils.hpp new file mode 100644 index 0000000000000..79ac4cc7b720d --- /dev/null +++ b/sycl/include/CL/sycl/intel/fpga_utils.hpp @@ -0,0 +1,34 @@ +//==------------- fpga_utils.hpp --- SYCL FPGA Reg Extensions --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace intel { + +template