From 09282d64a9f2ec2a51a26352d4035b3764f54447 Mon Sep 17 00:00:00 2001 From: Mohammad Fawaz Date: Wed, 22 Jul 2020 10:54:54 -0400 Subject: [PATCH] New spec for controlling load-store units in FPGAs --- sycl/doc/extensions/IntelFPGA/FPGALsu.md | 116 +++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 sycl/doc/extensions/IntelFPGA/FPGALsu.md diff --git a/sycl/doc/extensions/IntelFPGA/FPGALsu.md b/sycl/doc/extensions/IntelFPGA/FPGALsu.md new file mode 100644 index 0000000000000..7a412128d9afc --- /dev/null +++ b/sycl/doc/extensions/IntelFPGA/FPGALsu.md @@ -0,0 +1,116 @@ + +# FPGA lsu + +The Intel FPGA `lsu` class is implemented in `CL/sycl/intel/fpga_lsu.hpp` which +is included in `CL/sycl/intel/fpga_extensions.hpp`. + +The class `cl::sycl::intel::lsu` allows users to explicitly request that the +implementation of a global memory access is configured in a certain way. The +class has two member functions, `load()` and `store()` which allow loading from +and storing to a `global_ptr`, respectively, and is templated on the following +4 optional paremeters: + +1. **`cl::sycl::intel::burst_coalesce`, where `B` is a boolean**: request, +to the extent possible, that a dynamic burst coalescer be implemented when +`load` or `store` are called. The default value of this parameter is `false`. +2. **`cl::sycl::intel::cache`, where `N` is an integer greater or equal to +0**: request, to the extent possible, that a read-only cache of the specified +size in bytes be implemented when when `load` is called. It is not allowed to +use that parameter for `store`. The default value of this parameter is `0`. +3. **`cl::sycl::intel::statically_coalesce`, where `B` is a boolean**: +request, to the extent possible, that `load` or `store` accesses, is allowed to +be statically coalesced with other memory accesses at compile time. The default +value of this parameter is `true`. +4. **`cl::sycl::intel::prefetch`, where `N` is a boolean**: request, to the +extent possible, that a prefetcher be implemented when `load` is called. It is +not allowed to use that parameter for `store`. The default value of this +parameter is `false`. + +Currently, not every combination of parameters is allowed due to limitations in +the backend. The following rules apply: +1. For `store`, `cl::sycl::intel::cache` must be `0` and +`cl::sycl::intel::prefetch` must be `false`. +2. For `load`, if `cl::sycl::intel::cache` is set to a value greater than `0`, +then `cl::sycl::intel::burst_coalesce` must be set to `true`. +3. For `load`, exactly one of `cl::sycl::intel::prefetch` and +`cl::sycl::intel::burst_coalesce` is allowed to be `true`. +4. For `load`, exactly one of `cl::sycl::intel::prefetch` and +`cl::sycl::intel::cache` is allowed to be `true`. + +## Implementation + +The implementation relies on the Clang built-in `__builtin_intel_fpga_mem` when +parsing the SYCL device code. The built-in uses the LLVM `ptr.annotation` +intrinsic under the hood to annotate the pointer that is being accessed. +```c++ +template class lsu final { +public: + lsu() = delete; + + template static T &load(sycl::global_ptr Ptr) { + check_load(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + return *__builtin_intel_fpga_mem((T *)Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val); +#else + return *Ptr; +#endif + } + + template static void store(sycl::global_ptr Ptr, T Val) { + check_store(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + *__builtin_intel_fpga_mem((T *)Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val) = Val; +#else + *Ptr = Val; +#endif + } + ... +} +``` + +## Usage + +```c++ +#include +... +cl::sycl::buffer output_buffer(output_data, 1); +cl::sycl::buffer input_buffer(input_data, 1); + +Queue.submit([&](cl::sycl::handler &cgh) { + auto output_accessor = output_buffer.get_access(cgh); + auto input_accessor = input_buffer.get_access(cgh); + + cgh.single_task([=] { + auto input_ptr = input_accessor.get_pointer(); + auto output_ptr = output_accessor.get_pointer(); + + using PrefetchingLSU = + cl::sycl::intel::lsu, + cl::sycl::intel::statically_coalesce>; + + using BurstCoalescedLSU = + cl::sycl::intel::lsu, + cl::sycl::intel::statically_coalesce>; + + using CachingLSU = + cl::sycl::intel::lsu, + cl::sycl::intel::cache<1024>, + cl::sycl::intel::statically_coalesce>; + + using PipelinedLSU = cl::sycl::intel::lsu<>; + + int X = PrefetchingLSU::load(input_ptr); // int X = input_ptr[0] + int Y = CachingLSU::load(input_ptr + 1); // int Y = input_ptr[1] + + BurstCoalescedLSU::store(output_ptr, X); // output_ptr[0] = X + PipelinedLSU::store(output_ptr + 1, Y); // output_ptr[1] = Y + }); +}); +... +```