From bc9f0aa877f61ec6a163f2b53913c69010e11401 Mon Sep 17 00:00:00 2001 From: "Niu, Shuo" Date: Wed, 24 Nov 2021 12:52:10 -0800 Subject: [PATCH] Add experimental latency control API to FPGA extension docs --- .../data_flow_pipes_rev4_proposed.asciidoc | 88 +++++++++ .../IntelFPGA/FPGALsu_rev2_proposed.md | 176 +++++++++++++++++- 2 files changed, 262 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc b/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc index d8eb67072d4ed..4b8cbc5dffb1f 100755 --- a/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc +++ b/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc @@ -634,6 +634,92 @@ Automated mechanisms are possible to provide uniquification across calls, and co *RESOLUTION*: Resolved. Abstraction/libraries on top enable functionality like this. We will make public a library that enables arrays of pipes. -- +== Experimental APIs + +*NOTE*: The APIs described in this section are experimental. Future versions of this extension may change these APIs in ways that are incompatible with the versions described here. + +In the experimental API version, read/write methods take template arguments, which can contain the latency control properties `latency_anchor_id` and/or `latency_constraint`. + +* `sycl::ext::intel::experimental::latency_anchor_id`, where `N` is an integer: An ID to associate with the current read/write function call, which can then be referenced by other `latency_constraint` properties elsewhere in the program to define relative latency constaints. ID must be unique within the application, and a diagnostic is required if that condition is not met. +* `sycl::ext::intel::experimental::latency_constraint`: A tuple of three values which cause the current read/write function call to act as an endpoint of a latency constraint relative to a specified `latency_anchor_id` defined by a different instruction. +** `A` is an integer: The ID of the target anchor defined on a different instruction through a `latency_anchor_id` property. +** `B` is an enum value: The type of control from the set {`type::exact`, `type::max`, `type::min`}. +** `C` is an integer: The relative clock cycle difference between the target anchor and the current function call, that the constraint should infer subject to the type of the control (exact, max, min). + +The template arguments above don't have to be specified if user doesn't want to apply latency controls. The template arguments can be passed in arbitrary order. + +=== Implementation + +[source,c++] +---- +// Added in version 2 of this extension. +namespace sycl::ext::intel::experimental { +enum class type { + none, // default + exact, + max, + min +}; + +template struct latency_anchor_id { + static constexpr int32_t value = _N; + static constexpr int32_t default_value = -1; +}; + +template struct latency_constraint { + static constexpr std::tuple value = {_N1, _N2, _N3}; + static constexpr std::tuple default_value = { + 0, type::none, 0}; +}; + +template +class pipe { + // Blocking + template + static dataT read(); + template + static void write( const dataT &data ); + + // Non-blocking + template + static dataT read( bool &success_code ); + template + static void write( const dataT &data, bool &success_code ); +} +} // namespace sycl::ext::intel::experimental +---- + +=== Usage + +[source,c++] +---- +// Added in version 2 of this extension. +#include +... +using Pipe1 = ext::intel::experimental::pipe; +using Pipe2 = ext::intel::experimental::pipe; +using Pipe3 = ext::intel::experimental::pipe; + +myQueue.submit([&](handler &cgh) { + cgh.single_task([=] { + // The following Pipe1::read is anchor 0 + int value = Pipe1::read>(); + + // The following Pipe2::write is anchor 1 + // The following Pipe2::write occurs exactly 2 cycles after anchor 0 + Pipe2::write, + ext::intel::experimental::latency_constraint< + 0, ext::intel::experimental::type::exact, 2>>(value); + + // The following Pipe3::write occurs at least 2 cycles after anchor 1 + Pipe3::write>(value); + }); +}); +---- + == Feature test macro This extension provides a feature-test macro as described in the core SYCL @@ -648,6 +734,7 @@ extension's APIs the implementation supports. |=== |Value |Description |1 |Initial extension version. Base features are supported. +|2 |Add experimental latency control API. |=== == Revision History @@ -660,6 +747,7 @@ extension's APIs the implementation supports. |1|2019-09-12|Michael Kinsner|*Initial public working draft* |2|2019-11-13|Michael Kinsner|Incorporate feedback |3|2020-04-27|Michael Kinsner|Clarify that pipe operations behave as-if they are relaxed atomic operations. Make SYCL2020 the baseline +|4|2021-12-02|Shuo Niu|Add experimental latency control API |======================================== //************************************************************************ diff --git a/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md b/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md index e79cafe7560e5..ad85bee3af4bf 100644 --- a/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md +++ b/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md @@ -119,6 +119,177 @@ Queue.submit([&](cl::sycl::handler &cgh) { ... ``` +## Experimental APIs + +**NOTE**: The APIs described in this section are experimental. Future versions of +this extension may change these APIs in ways that are incompatible with the +versions described here. + +In the experimental API version, member functions `load()` and `store()` take +template arguments, which can contain the latency control properties +`latency_anchor_id` and/or `latency_constraint`. + +1. **`sycl::ext::intel::experimental::latency_anchor_id`, where `N` is an integer**: +represents ID of the current function call when it performs as an anchor. The ID +must be unique within the application, with a diagnostic required if that +condition is not met. +2. **`sycl::ext::intel::experimental::latency_constraint`** contains control +parameters when the current function performs as a non-anchor, where: + - **`A` is an integer**: The ID of the target anchor defined on a different + instruction through a `latency_anchor_id` property. + - **`B` is an enum value**: The type of control from the set + {`type::exact`, `type::max`, `type::min`}. + - **`C` is an integer**: The relative clock cycle difference between the + target anchor and the current function call, that the constraint should + infer subject to the type of the control (exact, max, min). + +The template arguments above don't have to be specified if user doesn't want to +apply latency controls. The template arguments can be passed in arbitrary order. + +### Implementation +```c++ +// Added in version 2 of this extension. +namespace sycl::ext::intel::experimental { +enum class type { + none, // default + exact, + max, + min +}; + +template struct latency_anchor_id { + static constexpr int32_t value = _N; + static constexpr int32_t default_value = -1; +}; + +template struct latency_constraint { + static constexpr std::tuple value = {_N1, _N2, _N3}; + static constexpr std::tuple default_value = { + 0, type::none, 0}; +}; + +template class lsu final { +public: + lsu() = delete; + + template + static _T load(sycl::multi_ptr<_T, _space> Ptr) { + check_space<_space>(); + check_load(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + static constexpr auto _anchor_id = + __GetValue::value; + static constexpr auto _constraint = + __GetValue3::value; + + static constexpr int _target_anchor = std::get<0>(_constraint); + static constexpr type _control_type = std::get<1>(_constraint); + static constexpr int _cycle = std::get<2>(_constraint); + int _type; + if (_control_type == type::none) { + _type = 0; + } else if (_control_type == type::exact) { + _type = 1; + } else if (_control_type == type::max) { + _type = 2; + } else { // _control_type == type::min + _type = 3; + } + + return *__latency_control_mem_wrapper((_T *)Ptr, _anchor_id, _target_anchor, + _type, _cycle); +#else + return *Ptr; +#endif + } + + template + static void store(sycl::multi_ptr<_T, _space> Ptr, _T Val) { + check_space<_space>(); + check_store(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + static constexpr auto _anchor_id = + __GetValue::value; + static constexpr auto _constraint = + __GetValue3::value; + + static constexpr int _target_anchor = std::get<0>(_constraint); + static constexpr type _control_type = std::get<1>(_constraint); + static constexpr int _cycle = std::get<2>(_constraint); + int _type; + if (_control_type == type::none) { + _type = 0; + } else if (_control_type == type::exact) { + _type = 1; + } else if (_control_type == type::max) { + _type = 2; + } else { // _control_type == type::min + _type = 3; + } + + *__latency_control_mem_wrapper((_T *)Ptr, _anchor_id, _target_anchor, _type, + _cycle) = Val; +#else + *Ptr = Val; +#endif + } + ... +private: +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + template + static _T *__latency_control_mem_wrapper(_T * Ptr, int32_t AnchorID, + int32_t TargetAnchor, int32_t Type, + int32_t Cycle) { + return __builtin_intel_fpga_mem(Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val); + } +#endif + ... +} +} // namespace sycl::ext::intel::experimental +``` + +### Usage +```c++ +// Added in version 2 of this extension. +#include +... +sycl::buffer output_buffer(output_data, 1); +sycl::buffer input_buffer(input_data, 1); +Queue.submit([&](sycl::handler &cgh) { + auto output_accessor = + output_buffer.get_access(cgh); + auto input_accessor = input_buffer.get_access(cgh); + + cgh.single_task([=] { + auto input_ptr = input_accessor.get_pointer(); + auto output_ptr = output_accessor.get_pointer(); + + // latency controls + using ExpPrefetchingLSU = sycl::ext::intel::experimental::lsu< + sycl::ext::intel::experimental::prefetch, + sycl::ext::intel::experimental::statically_coalesce>; + + using ExpBurstCoalescedLSU = sycl::ext::intel::experimental::lsu< + sycl::ext::intel::experimental::burst_coalesce, + sycl::ext::intel::experimental::statically_coalesce>; + + // The following load is anchor 1 + int Z = ExpPrefetchingLSU::load< + sycl::ext::intel::experimental::latency_anchor_id<1>>(input_ptr + 2); + + // The following store occurs exactly 5 cycles after the anchor 1 read + ExpBurstCoalescedLSU::store< + sycl::ext::intel::experimental::latency_constraint< + 1, sycl::ext::intel::experimental::type::exact, 5>>(output_ptr + 2, + Z); + }); +}); +... +``` + ## Feature Test Macro This extension provides a feature-test macro as described in the core SYCL @@ -126,9 +297,10 @@ specification section 6.3.3 "Feature test macros". Therefore, an implementation supporting this extension must predefine the macro `SYCL_EXT_INTEL_FPGA_LSU` to one of the values defined in the table below. Applications can test for the existence of this macro to determine if the implementation supports this -feature, or applications can test the macro�s value to determine which of the -extension�s APIs the implementation supports. +feature, or applications can test the macro's value to determine which of the +extension's APIs the implementation supports. |Value |Description| |:---- |:---------:| |1 |Initial extension version. Base features are supported.| +|2 |Add experimental latency control API.|