From 2365dc8296c8e515bd8bb6c9570bfe7bdc8db790 Mon Sep 17 00:00:00 2001 From: Greg Lueck Date: Tue, 27 Jun 2023 17:24:35 -0400 Subject: [PATCH 1/4] [SYCL][DOC] Update dot_acc extension spec Update the sycl_ext_oneapi_dot_accumulate extension spec to: * Use the latest specification template. * Document the "packed" APIs. These were previously shown in the "sample header" section, but there was no description. * Deprecate the old "packed" overloads and introduce new overloads with the more descriptive name `doc_acc_4x8packed`. This new name is consistent with the OpenCL C naming style for similar functions. --- .../sycl_ext_oneapi_dot_accumulate.asciidoc | 174 ++++++++++-------- 1 file changed, 101 insertions(+), 73 deletions(-) diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc index e77fce3b5071d..86bc199918e7e 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc @@ -1,74 +1,99 @@ -= SYCL_INTEL_dot_accumulate += sycl_ext_oneapi_dot_accumulate + :source-highlighter: coderay :coderay-linenums-mode: table + +// This section needs to be after the document title. :doctype: book +:toc2: +:toc: left :encoding: utf-8 :lang: en +:dpcpp: pass:[DPC++] -:blank: pass:[ +] +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} -// Set the default source code type in this document to C, -// for syntax highlighting purposes. -:language: c -// This is what is needed for C++, since docbook uses c++ -// and everything else uses cpp. This doesn't work when -// source blocks are in table cells, though, so don't use -// C++ unless it is required. -//:language: {basebackend@docbook:c++:cpp} +== Notice + +[%hardbreaks] +Copyright (C) 2020-2023 Intel Corporation. All rights reserved. -== Introduction +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. -IMPORTANT: This specification is a draft. -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. +== Contact -NOTE: This document is better viewed when rendered as html with asciidoctor. GitHub does not render image icons. +To report problems with this extension, please open a new issue at: -== Name Strings +https://github.com/intel/llvm/issues -`SYCL_ONEAPI_dot_accumulate` -This is a placeholder name. +== Dependencies -== Notice +This extension is written against the SYCL 2020 revision 7 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. -Copyright (c) 2020 Intel Corporation. All rights reserved. == Status -Working Draft +This extension is implemented and fully supported by {dpcpp}. -This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension. -Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products. +== Overview -== Version +This extension adds new SYCL built-in functions that may simplify development +and provide access specialized hardware instructions when a SYCL kernel needs +to perform a dot product of two vectors followed by a scalar accumulation. -Built On: {docdate} + -Revision: 3 -== Contact +== Specification -Ben Ashbaugh, Intel (ben 'dot' ashbaugh 'at' intel 'dot' com) +=== Feature test macro -== Dependencies +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_DOT_ACCUMULATE` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. -This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6. +[%header,cols="1,5"] +|=== +|Value +|Description -== Overview +|1 +|Initial version of this extension. +|=== -This extension adds new SYCL built-in functions that may simplify development and provide access specialized hardware instructions when a SYCL kernel needs to perform a dot product of two vectors followed by a scalar accumulation. +=== New functions to compute the dot product of vectors -== Enabling the extension +This extension adds the following free functions: -The extension is always enabled. The dot product functionality may be emulated in software or executed using hardware when suitable instructions are available. +[source,c++] +---- +namespace sycl::ext::oneapi { -== Modifications of SYCL 1.2.1 specification +int32_t dot_acc(vec a, vec b, int32_t c); +int32_t dot_acc(vec a, vec b, int32_t c); +int32_t dot_acc(vec a, vec b, int32_t c); +int32_t dot_acc(vec a, vec b, int32_t c); -=== Add to Section 4.13.6 - Geometric Functions +int32_t doc_acc_4x8packed(int32_t a, int32_t b, int32_t c); +int32_t doc_acc_4x8packed(int32_t a, uint32_t b, int32_t c); +int32_t doc_acc_4x8packed(uint32_t a, int32_t b, int32_t c); +int32_t doc_acc_4x8packed(uint32_t a, uint32_t b, int32_t c); -Additionally, the following additional functions are available in the namespace `sycl::intel` on the host and device. +} // namespace sycl::ext::oneapi +---- [cols="4a,4",options="header"] |==== @@ -95,56 +120,59 @@ int32_t dot_acc(vec a, {blank} The value that is returned is equivalent to + {blank} -*dot*(_a_, _b_) + _c_ +`dot(a, b) + c` + +|[source,c] +---- +int32_t doc_acc_4x8packed(int32_t a, + int32_t b, + int32_t c) +int32_t doc_acc_4x8packed(int32_t a, + uint32_t b, + int32_t c) +int32_t doc_acc_4x8packed(uint32_t a, + int32_t b, + int32_t c) +int32_t doc_acc_4x8packed(uint32_t a, + uint32_t b, + int32_t c); +---- + +|Performs a four-component integer dot product accumulate operation, where +`a` and `b` are 32-bit integers that represent a vector of 4 8-bit elements. +When the type of `a` or `b` is `int32_t`, it is interpreted as `vec`. +When the type of `a` or `b` is `uint32_t`, it is interpreted as +`vec`. In each case, the least significant byte is element 0, and +the most significant byte is element 3. |==== -== Sample Header +=== Deprecated functions + +The following functions are deprecated. They have the same effect as the +`doc_acc_4x8packed` overloads described above. [source,c++] ---- -namespace cl { -namespace sycl { -namespace ext { -namespace oneapi { - -int32_t dot_acc(vec a, vec b, int32_t c); -int32_t dot_acc(vec a, vec b, int32_t c); -int32_t dot_acc(vec a, vec b, int32_t c); -int32_t dot_acc(vec a, vec b, int32_t c); +namespace sycl::ext::oneapi { int32_t dot_acc(int32_t a, int32_t b, int32_t c); int32_t dot_acc(int32_t a, uint32_t b, int32_t c); int32_t dot_acc(uint32_t a, int32_t b, int32_t c); int32_t dot_acc(uint32_t a, uint32_t b, int32_t c); -} // oneapi -} // ext -} // sycl -} // cl +} // namespace sycl::ext::oneapi ---- + == Issues -None. - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2019-12-13|Ben Ashbaugh|*Initial draft* -|2|2019-12-18|Ben Ashbaugh|Switched to standard C++ fixed width types. -|3|2020-10-26|Rajiv Deodhar|Added int32 types. -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use `mono` text for device APIs, or [source] syntax highlighting. -//* Use `mono` text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ +* The overloads that take two unsigned vectors have a signed `c` and return a + signed result. This is inconsistent with the SPIR-V primitives and the + OpenCL C APIs, both of which return an unsigned value in this case and expect + an unsigned `c`. I think we could implement the APIs as they are using the + SPIR-V primitives, but the extra unsigned-to-signed conversions might + generate less efficient code (I haven't checked). Is there a compelling + reason to keep these APIs as they are now? If not, we could deprecate them + and introduce overloads that take an unsigned `c` and return an unsigned + value. From 84f1ff887f8c5cc725ea3248330dd52b5844ea82 Mon Sep 17 00:00:00 2001 From: Greg Lueck Date: Thu, 27 Jul 2023 16:57:50 -0400 Subject: [PATCH 2/4] Address review feedback * Deprecate the overload taking two unsigned vectors and returning a signed result and replace it with a version returning an unsigned result. Addresses the open issue. * Change the names of the new "packed" APIs to include a suffix which tells how the `a` and `b` vectors are interpreted (signed vs. unsigned). * Change the new "packed" APIs so that `a` and `b` are always unsigned integers. The name of the function now tells how to interpret them. --- .../sycl_ext_oneapi_dot_accumulate.asciidoc | 152 +++++++++++++----- 1 file changed, 110 insertions(+), 42 deletions(-) diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc index 86bc199918e7e..ba5d109be72c8 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc @@ -85,12 +85,12 @@ namespace sycl::ext::oneapi { int32_t dot_acc(vec a, vec b, int32_t c); int32_t dot_acc(vec a, vec b, int32_t c); int32_t dot_acc(vec a, vec b, int32_t c); -int32_t dot_acc(vec a, vec b, int32_t c); +uint32_t dot_acc(vec a, vec b, uint32_t c); -int32_t doc_acc_4x8packed(int32_t a, int32_t b, int32_t c); -int32_t doc_acc_4x8packed(int32_t a, uint32_t b, int32_t c); -int32_t doc_acc_4x8packed(uint32_t a, int32_t b, int32_t c); -int32_t doc_acc_4x8packed(uint32_t a, uint32_t b, int32_t c); +int32_t doc_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c); +int32_t doc_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c); +int32_t doc_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c); +uint32_t doc_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c); } // namespace sycl::ext::oneapi ---- @@ -111,51 +111,71 @@ int32_t dot_acc(vec a, int32_t dot_acc(vec a, vec b, int32_t c) -int32_t dot_acc(vec a, - vec b, - int32_t c) +uint32_t dot_acc(vec a, + vec b, + uint32_t c) +---- + +|Performs a four-component integer dot product accumulate operation. The value +that is returned is equivalent to `dot(a, b) + c`, where `dot` computes the +dot product of two vectors. + +|[source,c] +---- +int32_t doc_acc_4x8packed_ss(uint32_t a, + uint32_t b, + int32_t c) +---- + +|Performs a four-component integer dot product accumulate operation, where +`a` and `b` are both interpreted as `vec`. + +|[source,c] +---- +int32_t doc_acc_4x8packed_su(uint32_t a, + uint32_t b, + int32_t c) ---- -|Performs a four-component integer dot product accumulate operation. + -{blank} -The value that is returned is equivalent to + -{blank} -`dot(a, b) + c` +|Performs a four-component integer dot product accumulate operation, where +`a` is interpreted as `vec` and `b` is interpreted as +`vec`. |[source,c] ---- -int32_t doc_acc_4x8packed(int32_t a, - int32_t b, - int32_t c) -int32_t doc_acc_4x8packed(int32_t a, - uint32_t b, - int32_t c) -int32_t doc_acc_4x8packed(uint32_t a, - int32_t b, - int32_t c) -int32_t doc_acc_4x8packed(uint32_t a, - uint32_t b, - int32_t c); +int32_t doc_acc_4x8packed_us(uint32_t a, + uint32_t b, + int32_t c) ---- |Performs a four-component integer dot product accumulate operation, where -`a` and `b` are 32-bit integers that represent a vector of 4 8-bit elements. -When the type of `a` or `b` is `int32_t`, it is interpreted as `vec`. -When the type of `a` or `b` is `uint32_t`, it is interpreted as -`vec`. In each case, the least significant byte is element 0, and -the most significant byte is element 3. +`a` is interpreted as `vec` and `b` is interpreted as +`vec`. +|[source,c] +---- +uint32_t doc_acc_4x8packed_uu(uint32_t a, + uint32_t b, + uint32_t c); +---- + +|Performs a four-component integer dot product accumulate operation, where +`a` and `b` are both interpreted as `vec`. |==== +For all the "packed" overloads, the least significant byte of the integer is +element 0, and the most significant byte is element 3. + === Deprecated functions -The following functions are deprecated. They have the same effect as the -`doc_acc_4x8packed` overloads described above. +The following functions are deprecated. [source,c++] ---- namespace sycl::ext::oneapi { +int32_t dot_acc(vec a, vec b, int32_t c); + int32_t dot_acc(int32_t a, int32_t b, int32_t c); int32_t dot_acc(int32_t a, uint32_t b, int32_t c); int32_t dot_acc(uint32_t a, int32_t b, int32_t c); @@ -164,15 +184,63 @@ int32_t dot_acc(uint32_t a, uint32_t b, int32_t c); } // namespace sycl::ext::oneapi ---- +[cols="4a,4",options="header"] +|==== +| *Function* +| *Description* -== Issues +|[source,c] +---- +int32_t dot_acc(vec a, + vec b, + int32_t c) +---- + +|Performs a four-component integer dot product accumulate operation, where the +elements of `a` and `b` are unsigned while `c` is signed. Use the version +taking an unsigned `c` instead. + +|[source,c] +---- +int32_t dot_acc(int32_t a, + int32_t b, + int32_t c) +---- -* The overloads that take two unsigned vectors have a signed `c` and return a - signed result. This is inconsistent with the SPIR-V primitives and the - OpenCL C APIs, both of which return an unsigned value in this case and expect - an unsigned `c`. I think we could implement the APIs as they are using the - SPIR-V primitives, but the extra unsigned-to-signed conversions might - generate less efficient code (I haven't checked). Is there a compelling - reason to keep these APIs as they are now? If not, we could deprecate them - and introduce overloads that take an unsigned `c` and return an unsigned - value. +|Performs a four-component integer dot product accumulate operation, where +`a` and `b` are both interpreted as `vec`. Use +`doc_acc_4x8packed_ss` instead. + +|[source,c] +---- +int32_t dot_acc(int32_t a, + uint32_t b, + int32_t c) +---- + +|Performs a four-component integer dot product accumulate operation, where +`a` is interpreted as `vec` and `b` is interpreted as +`vec`. Use `doc_acc_4x8packed_su` instead. + +|[source,c] +---- +int32_t dot_acc(uint32_t a, + int32_t b, + int32_t c) +---- + +|Performs a four-component integer dot product accumulate operation, where +`a` is interpreted as `vec` and `b` is interpreted as +`vec`. Use `doc_acc_4x8packed_us` instead. + +|[source,c] +---- +int32_t dot_acc(uint32_t a, + uint32_t b, + int32_t c) +---- + +|Performs a four-component integer dot product accumulate operation, where +`a` and `b` are both interpreted as `vec`. Use +`doc_acc_4x8packed_uu` instead. +|==== From 995e7834b0a5b830a0fe4f25f0e3cd7dc8415eac Mon Sep 17 00:00:00 2001 From: Greg Lueck Date: Tue, 8 Oct 2024 11:46:44 -0400 Subject: [PATCH 3/4] Fix typo doc -> dot --- .../sycl_ext_oneapi_dot_accumulate.asciidoc | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc index ba5d109be72c8..45fa243844a3d 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc @@ -87,10 +87,10 @@ int32_t dot_acc(vec a, vec b, int32_t c); int32_t dot_acc(vec a, vec b, int32_t c); uint32_t dot_acc(vec a, vec b, uint32_t c); -int32_t doc_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c); -int32_t doc_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c); -int32_t doc_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c); -uint32_t doc_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c); +int32_t dot_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c); +int32_t dot_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c); +int32_t dot_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c); +uint32_t dot_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c); } // namespace sycl::ext::oneapi ---- @@ -122,7 +122,7 @@ dot product of two vectors. |[source,c] ---- -int32_t doc_acc_4x8packed_ss(uint32_t a, +int32_t dot_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c) ---- @@ -132,7 +132,7 @@ int32_t doc_acc_4x8packed_ss(uint32_t a, |[source,c] ---- -int32_t doc_acc_4x8packed_su(uint32_t a, +int32_t dot_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c) ---- @@ -143,7 +143,7 @@ int32_t doc_acc_4x8packed_su(uint32_t a, |[source,c] ---- -int32_t doc_acc_4x8packed_us(uint32_t a, +int32_t dot_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c) ---- @@ -154,7 +154,7 @@ int32_t doc_acc_4x8packed_us(uint32_t a, |[source,c] ---- -uint32_t doc_acc_4x8packed_uu(uint32_t a, +uint32_t dot_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c); ---- @@ -209,7 +209,7 @@ int32_t dot_acc(int32_t a, |Performs a four-component integer dot product accumulate operation, where `a` and `b` are both interpreted as `vec`. Use -`doc_acc_4x8packed_ss` instead. +`dot_acc_4x8packed_ss` instead. |[source,c] ---- @@ -220,7 +220,7 @@ int32_t dot_acc(int32_t a, |Performs a four-component integer dot product accumulate operation, where `a` is interpreted as `vec` and `b` is interpreted as -`vec`. Use `doc_acc_4x8packed_su` instead. +`vec`. Use `dot_acc_4x8packed_su` instead. |[source,c] ---- @@ -231,7 +231,7 @@ int32_t dot_acc(uint32_t a, |Performs a four-component integer dot product accumulate operation, where `a` is interpreted as `vec` and `b` is interpreted as -`vec`. Use `doc_acc_4x8packed_us` instead. +`vec`. Use `dot_acc_4x8packed_us` instead. |[source,c] ---- @@ -242,5 +242,5 @@ int32_t dot_acc(uint32_t a, |Performs a four-component integer dot product accumulate operation, where `a` and `b` are both interpreted as `vec`. Use -`doc_acc_4x8packed_uu` instead. +`dot_acc_4x8packed_uu` instead. |==== From 37ae853ba3e20256f3f13a1c8325cc1a680bc14a Mon Sep 17 00:00:00 2001 From: Greg Lueck Date: Tue, 8 Oct 2024 11:55:43 -0400 Subject: [PATCH 4/4] Minor updates * Update to single-year copyright format * Update base SYCL 2020 revision to latest --- .../supported/sycl_ext_oneapi_dot_accumulate.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc index 45fa243844a3d..47ec21185b174 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_dot_accumulate.asciidoc @@ -20,7 +20,7 @@ == Notice [%hardbreaks] -Copyright (C) 2020-2023 Intel Corporation. All rights reserved. +Copyright (C) 2020 Intel Corporation. All rights reserved. Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by @@ -36,7 +36,7 @@ https://github.com/intel/llvm/issues == Dependencies -This extension is written against the SYCL 2020 revision 7 specification. All +This extension is written against the SYCL 2020 revision 9 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision.