From 2fe82e952a18ab87188b69b9480ebaeae4ed9408 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Fri, 1 Oct 2021 15:33:51 +0100
Subject: [PATCH 01/16] [SYCL][CUDA][MATRIX][DOC] Matrix tensorcore extension
 proposal

This proposal is based on the existing AMX proposal, making small
adaptations to accommodate the Nvidia tensorcore hardware. The intention
is that the proposal should be compatible with both architectures (AMX
and tensorcore).

Signed-off-by: JackAKirk
---
 .../dpcpp-joint-matrix-tensorcore.asciidoc | 287 ++++++++++++++++++
 1 file changed, 287 insertions(+)
 create mode 100644 sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc

diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc
new file mode 100644
index 000000000000..78ad9d42d21c
--- /dev/null
+++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc
@@ -0,0 +1,287 @@
+# (Nvidia Tensorcore) Matrix Programming Extension for DPC++: SYCL_EXT_ONEAPI_MATRIX=3
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+:dpcpp: pass:[DPC++]
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes. This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+Copyright (c) 2021-2021 Intel Corporation. All rights reserved.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+This extension is written against the SYCL 2020 revision 3 specification. All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision. This extension builds on the existing AMX-based matrix https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc[extension].
+
+
+**_NOTE:_** _This document describes the current design and API for the Nvidia tensorcore version of the matrix extension to {dpcpp}. This is an initial experimental version to try out functionality and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. The current implementation provides support for the matrix extension interface on Nvidia(R) Tensorcores. We are going to work with the community on incrementally improving the API to develop a single matrix interface that may be used for all backend architectures._
+
+## Introduction
+
+This document presents ongoing work towards defining a unified matrix interface. This extension applies the existing experimental matrix extension (designed for the AMX architecture) to Nvidia tensorcore hardware, making small adaptations where necessary.
+
+**_NOTE:_** _Any necessary adaptations to the extension aim to ensure compatibility with a suitable AMX matrix implementation; any necessary adaptations to the existing AMX implementation resulting from changes introduced in this proposal should be small._
+
+The initial implementation of this extension uses Warp Matrix Multiply Accumulate https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-wmma[(wmma) PTX instructions], which can be used generally with the Volta (sm_70, sm_72), Turing (sm_75), and Ampere (sm_80, sm_86) architecture generations. These instructions are also expected to be forward compatible with future Nvidia generations. A future implementation may additionally make use of Nvidia PTX mma instructions, which are architecture-generation specific and may increase performance with respect to the corresponding wmma instructions. It is possible to implement mma PTX instructions without additional changes to this extension proposal.
+
+## Feature test macro
+
+This extension uses the existing feature-test macro used by the AMX matrix extension. Feature test macros are described in the core SYCL
+specification section 6.3.3 "Feature test macros". An
+implementation supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_MATRIX` to one of the values defined in the table below.
+Applications can test for the existence of this macro to determine if the
+implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+[frame="none",options="header"]
+|======================
+|Value |Description
+|3 |Initial extension implementation on Nvidia Tensorcore. Base features are supported.
+|======================
+
+## Currently implemented additions with respect to the AMX proposal
+
+### Matrix Type
+
+We introduce a new `matrix_type` enum which is necessary to distinguish the correct low-level PTX instruction for each operation.
+
+```c++
+namespace sycl::ext::intel::experimental::matrix {
+enum class matrix_type { a, b, accumulator };
+}
+```
+
+### Layout
+
+We adapt the Layout enum by including only a single `matrix_layout::packed` value. Different "packed" variations for A and B matrix types can be determined by the new `matrix_type` enum.
+
+**_NOTE:_** _The "packed" layout is only applicable to the AMX implementation: matrix_layout::packed is not required by the implementation of Nvidia wmma and mma instructions. We suggest that the AMX matrix extension could consider replacing its usage of matrix_layout::packed_a and matrix_layout::packed_b with the single matrix_layout::packed, in conjunction with matrix_type::a and matrix_type::b introduced here._
+
+```c++
+namespace sycl::ext::intel::experimental::matrix {
+enum class matrix_layout { row_major, col_major, packed };
+}
+```
+
+## Types, Shapes, and Layouts
+
+Unlike the AMX case, the Nvidia Tensorcore architecture supports only a discrete set of matrix sizes that can form part of a Multiply Accumulate operation, and the supported matrix sizes depend on the data type of the matrix elements.
+
+MMA operations multiply matrices A (`matrix_type::a`) (M, K) and B (`matrix_type::b`) (K, N) and add the result to matrix C (`matrix_type::accumulator`) (M, N). The logical sizes are M, K, N.
+ +C = A*B + C + +### Current Implementation Restrictions + +Currently only a single case: fp64 (M = N = 8, K = 4) is implemented: + +A(double, 8x4, row_major/col_major), B(double, 4x8, row_major/col_major), C(double, 8x8, row_major/col_major) + +In order to deal with different cases we use partial specialization of the various template functions introduced by the extension. LLVM builtins are available for all possible matrix shapes, and runtime implementations covering these cases will be progressively added. + +### `joint_matrix` interface uses the new parameter, `matrix_type`, with respect to the AMX proposal + +We reuse the `joint_matrix` interface but add the new parameter, `matrix_type`. The user needs to additionally specify the type of the elements, shape, memory layout, and memory scope of the matrix. This results into the following description: + +```c++ +template +struct joint_matrix { + joint_matrix(Group g) {} +}; +``` + +## Matrix Operations and their Execution Scope + +We define the three functions needed to perform the main and common operations on matrices namely, load, store, and the actual Multiply And Add operation. This set of functions can be easily extended if the Nvidia Tensorcore hardware implements new features. + +Since the matrix functions are group operations (as defined in Section 4.17.3 of the SYCL specification), the matrix API has to be accessed by all the work-items in the group in a convergent control flow. As described in the AMX extension, `joint_matrix` is shared across a number of work-items that is hardware dependent. For the case of Nvidia the number of work-items (CUDA threads) is equal to the warp size (32). + +For the CUDA backend the work-items that share a `joint_matrix` instance belong to the same sub-group. The group template parameter provided to `joint_matrix` is always a `sycl::sub_group`. For example a column major matrix must be declared as follows: + +```c++ +joint_matrix tB; +``` + +where currently only `T = double` is supported. + +**_NOTE:_** _The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-groups. The requirement that kernels make use of the `sycl::reqd_sub_group_size` decorator is only for specific backends._ + +To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. The {dpcpp} syntax is the following: + +### Load + +```c++ +template +void joint_matrix_load( + Group sg, joint_matrix &res, + multi_ptr src, size_t stride) { + detail::joint_matrix_load_impl{} + .load(res, src, stride); +} +``` + +This function loads data from memory to the Nvidia matrix "fragments". + +The base pointer, `src`, determines the starting address of the sub-matrix to be loaded/stored. `layout` determines whether the data are being read/written with leading dimension `row_major` or `column_major`. `stride` describes the number of elements between consecutive rows for row major and packed layout, or columns for column major layout. + +IMPORTANT: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and the layout in the store of matrix C must also be either `row_major` or `col_major`. + +**_NOTE:_** _The Layout argument has been removed with respect to the AMX extension in both `joint_matrix_load` and `joint_matrix_store`, since the Layout may be determined from the `joint_matrix`. 
The addition of the `matrix_type` enumerator may also simplify the AMX implementation so that the Layout argument in `joint_matrix_load` and `joint_matrix_store` can be similarly removed for that case._ + +The stride is currently passed to the wmma ptx instructions. The wmma ptx instruction then uses stride to pick the correct address for the current thread to load the correct fragment depending on the architecture. When ptx mma instructions are used instead of the general wmma instructions, it becomes the responsibility of the implementation to provide the ptx mma instructions executed by each thread with the correct address to load fragments from. The implementation can make use of `stride` to find the correct addresses. + +### Store + +```c++ +template +void joint_matrix_store(Group sg, + joint_matrix &src, + multi_ptr dst, size_t stride) { + detail::joint_matrix_store_impl{} + .store(src, dst, stride); +} +``` +This function stores the data from the Nvidia matrix "fragments" back to memory. + +### Matrix fragments + +Fragments hold a set of matrix elements. Each thread is responsible for a fragment of the matrix. Depending on its usage, a fragment may hold a single row or column of a matrix, or a subset of a row or column. The number of matrix elements held by each thread in a fragment depends on the matrix operation being executed. For some matrix shapes/matrix element data types, matrix elements are packed into a larger data type within a fragment. wmma ptx instructions pick the appropriate thread for each matrix fragment depending on the architecture generation used. + +As stated by the Nvidia PTX ISA: + +*"Each thread in the warp holds a fragment of the matrix. The distribution of fragments loaded by the threads in a warp is unspecified and is target architecture dependent, and hence the identity of the fragment within the matrix is also unspecified and is target architecture dependent."* + +In the hardware specific mma ptx instructions the distribution of fragments loaded by the threads in a warp is specified. It is therefore the responsibility of the implementation to provide the correct address for the contiguous matrix elements corresponding to each fragment. + +### Multiply and Add + +```c++ +template +joint_matrix +joint_matrix_mad( + Group sg, joint_matrix A, + joint_matrix B, + joint_matrix C) { + return detail::joint_matrix_mad_impl{} + .mad(sg, A, B, C); +} +``` +The matrix multiply and add function performs the multiply operation on the matrices `A` and `B`, accumulates the result with `C` and returns the result. + +## Concise example using double type and row_major matrices + +```c++ +using namespace sycl::ext::intel::experimental::matrix; + +cgh.parallel_for( + nd_range<2>(GlobalRange, + LocalRange), + [=](nd_item<2> item){ + sub_group sg = item.get_sub_group(); + const auto m = item.get_group().get_id()[0]; // row id of current submatrix of BIG C matrix. + const auto n = item.get_group().get_id()[1]; // column id of current submatrix of BIG C matrix. + joint_matrix sub_c; + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * BIG_N + n * N, STRIDE_C); + for (int k = 0; k < SUB_TILES_K; k += 1) {// row/col id of current submatrix of BIG A/B matrices. 
+ joint_matrix_load(sg, sub_a, accA.get_pointer() + (k * K) + (m * M * BIG_K), STRIDE_A); + joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * K * BIG_N) + (n * N), STRIDE_B); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);} + joint_matrix_store(sg, sub_c, accD.get_pointer() + (m * M) * BIG_N + n * N, STRIDE_C);});}); +``` + +## Implementation Status + +Currently, this is the compilation command line needed to invoke the extension on program "matrix-cuda.cpp": + +```c++ +clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 matrix-cuda.cpp -o output +``` +**_NOTE:_** _--cuda-gpu-arch may be set lower than sm_80 depending on the required matrix operation and whether it is supported by the desired arch._ + +## Future Implementation Work + +### Dealing with tf32 and bf16 matrix element types + +Alternative CUDA floating point types, bf16 and tf32, use the same number of bits for the exponent as fp32, so that these data types can cover the same range of numbers as float using lower precision. For this reason a DPC++ programmer will be able to use these more efficient low precision data types in matrix operations by providing a matrix array consisting of fp32 elements as an argument to `joint_matrix_load` or `joint_matrix_store`. +We will introduce a new enum, `matrix::precision`, that must be provided to the `joint_matrix` interface as an additional argument when the user desires bf16 or tf32 to be used as the A, B matrix element data type. A future implementation will make use of the https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt[cvt PTX instruction] to cast the fp32 elements to either the tf32 or bf16 type. + +```c++ +namespace sycl::ext::intel::experimental::matrix { +enum class precision +{ + tf32, + bf16 +}; +} +``` + +### Clarify USM compatibility + +multi_ptr can be constructed from T* since https://github.com/intel/llvm/pull/1183. However currently this cannot be used with USM for all cases. +It is expected that eventually the `joint_matrix_load` and `joint_matrix_store` interfaces will be fully compatible with USM. Currently USM has only been validated to work with this extension for a single case: using shared USM pointers by casting them to the global address space in the following way: + +```c++ +joint_matrix_load(sg, sub_c, global_ptr(d_C) + (m * M) * BIG_N + n * N, STRIDE_C); +``` + +Where d_C is a shared USM pointer, e.g.: + +```c++ +double* d_C = malloc_shared(size, queue); +``` + +However even this case is not reliable and requires more testing. + +### Ensuring that non-portable cases provide intelligible error messages to users. + +This extension proposal is intended to be compatible with a hypothetical AMX implementation. However this requirement necessitates the inclusion of `matrix_layout::packed` which is incompatible with the CUDA implementations of `joint_matrix`, `joint_matrix_load`, `joint_matrix_store`, and `joint_matrix_mad`. Similar portability issues would occur in the other direction once cases dealing with the alternative CUDA floating point types, tf32 and bf16, are implemented. In addition, more backends are expected to support the matrix extension in the future. This means that a common means of reporting errors that result from users attempting to e.g. port code written for AMX using the packed format to CUDA, needs to be defined in a more mature version of the matrix extension. 
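As an illustration of one possible reporting mechanism, the standalone sketch below shows how an implementation could turn an unsupported layout into a readable compile-time diagnostic rather than a failed builtin lookup. The helper name `check_cuda_layout` and the use of `static_assert` are assumptions of this sketch, not part of the extension.

```c++
// Standalone sketch only: mirrors the matrix_layout enum above; the check is
// illustrative and not part of the proposed API.
enum class matrix_layout { row_major, col_major, packed };

template <matrix_layout Layout> void check_cuda_layout() {
  static_assert(Layout == matrix_layout::row_major ||
                    Layout == matrix_layout::col_major,
                "The CUDA backend supports only row_major or col_major "
                "layouts for joint_matrix load/store/mad.");
}

int main() {
  check_cuda_layout<matrix_layout::row_major>(); // compiles
  // check_cuda_layout<matrix_layout::packed>(); // would emit the message above
}
```

An analogous compile-time check could guard element data types (for example tf32 or bf16) on backends that do not support them.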
+ +### Implementation of hardware generation specific mma ptx instructions + +It should be decided whether mma ptx instructions are to be a default optimization when available, or whether the dpc++ programmer should decide whether to use these potential optimizations. + +## TODO List + +- Add an implementation for matrix multiplication using the tf32 and bf16 types. +- Add remaining shapes/data types for wmma instructions. +- Verify that USM is fully compatible once a USM pointer can be generally correctly cast to multi_ptr. +- Work out and maintain a common interface with AMX (and other archs). +- Optimize for specific Nvidia hardware using mma ptx instructions. + +## Revision History + +[frame="none",options="header"] +|====================== +|Rev |Date |Author |Changes +|1 | |Jack Kirk |Initial public working draft. +|====================== From 5baff2dc40912ab044a823a11d5647ae64719e16 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 8 Nov 2021 14:33:55 +0000 Subject: [PATCH 02/16] Added matrix_use to proposal. Signed-off-by: jack.kirk --- .../dpcpp-joint-matrix-tensorcore.asciidoc | 10 +-- .../Matrix/dpcpp-joint-matrix.asciidoc | 83 ++++++++++++------- 2 files changed, 58 insertions(+), 35 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc index 78ad9d42d21c..ffd8d16a052b 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc @@ -59,9 +59,9 @@ value to determine which of the extension's APIs the implementation supports. ## Currently implemented additions with respect to the AMX proposal -### Matrix Type +### Matrix Use -We introduce a new `matrix_type` enum which is necessary to distingush the correct low level PTX instruction for each operation. +We introduce a new `matrix_use` enum which is necessary to distingush the correct low level PTX instruction for each operation. ```c++ namespace sycl::ext::intel::experimental::matrix { @@ -120,7 +120,7 @@ Since the matrix functions are group operations (as defined in Section 4.17.3 of For the CUDA backend the work-items that share a `joint_matrix` instance belong to the same sub-group. The group template parameter provided to `joint_matrix` is always a `sycl::sub_group`. For example a column major matrix must be declared as follows: ```c++ -joint_matrix tB; +joint_matrix tB; ``` where currently only `T = double` is supported. @@ -233,11 +233,11 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-g ### Dealing with tf32 and bf16 matrix element types Alternative CUDA floating point types, bf16 and tf32, use the same number of bits for the exponent as fp32, so that these data types can cover the same range of numbers as float using lower precision. For this reason a DPC++ programmer will be able to use these more efficient low precision data types in matrix operations by providing a matrix array consisting of fp32 elements as an argument to `joint_matrix_load` or `joint_matrix_store`. -We will introduce a new enum, `matrix::precision`, that must be provided to the `joint_matrix` interface as an additional argument when the user desires bf16 or tf32 to be used as the A, B matrix element data type. 
A future implementation will make use of the https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt[cvt PTX instruction] to cast the fp32 elements to either the tf32 or bf16 type. +We will introduce a new enum, `matrix::matrix_type`, that must be provided to the `joint_matrix` interface as an additional argument when the user desires bf16 or tf32 to be used as the A, B matrix element data type. A future implementation will make use of the https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt[cvt PTX instruction] to cast the fp32 elements to either the tf32 or bf16 type. ```c++ namespace sycl::ext::intel::experimental::matrix { -enum class precision +enum class matrix_type { tf32, bf16 diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index f3b96a8827ea..6a0a97394feb 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -33,11 +33,11 @@ SYCL specification refer to that revision. **_NOTE:_** _This document describes the current design and API for the matrix extension to {dpcpp}. This is an initial experimental version to try out functionality -and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. The current implementation provides support of the matrix interface on Intel(R) Advanced Matrix Extensions (AMX) and DPAS. We are going to work with the community on incrementally improving +and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. The current implementation provides support of the matrix interface on Intel(R) Advanced Matrix Extensions (AMX), DPAS, and Nvidia Tensorcore. We are going to work with the community on incrementally improving the API to bring them closer to standard C++ (aligned with the `std::mdspan` and `std::mdarray` proposals) and SYCL in the next several months._ ## Introduction -This document presents an ongoing work towards defining a unified matrix interface. This interface is intended to unify different tensor hardware: Intel AMX in CPUs, Habana Gaudi and Goya tensor and gemm cores, Nvidia TPUs, IBM Power MMA. All these hardware provide low-level intrinsics or assembly to access and perform matrix operations. The goal is to provide a unified interface that is portable but also benefit from the maximum performance these different hardware can offer. +This document presents an ongoing work towards defining a unified matrix interface. This interface is intended to unify different tensor hardware: Intel AMX in CPUs, Habana Gaudi and Goya tensor and gemm cores, Nvidia TPUs, IBM Power MMA. All these hardware provide low-level intrinsics or assembly to access and perform matrix operations. The goal is to provide a unified interface that is portable but also benefits from the maximum performance these different hardware can offer. ## Feature test macro @@ -54,34 +54,46 @@ value to determine which of the extension's APIs the implementation supports. |Value |Description |1 |Initial extension implementation on Intel AMX. Base features are supported. |2 |Initial extension JIT implementation on Intel AMX and DPAS. load, store, mad and the query interface are supported +|3 |Initial extension implementation on Nvidia Tensorcore. Base features are supported. 
|====================== ## New `joint_matrix` class -We introduce a new class called `joint_matrix`. The user needs to specify the type of the elements, shape, the memory layout, and the memory scope of the matrix. This results into the following description: +We introduce a new class called `joint_matrix`. The user needs to specify the type of the elements, matrix_use (a, b, accumulator), shape, the memory layout, and the memory scope of the matrix. This results into the following description: ```c++ namespace sycl::ext::oneapi::experimental::matrix { -template +template struct joint_matrix { - joint_matrix(Group g) {} + joint_matrix(Group g) {} }; } ``` +The final optional conditional argument can be used to remove cases for template parameter values which are incompatible with a particular backend. +For example, the Nvidia Tensorcore backend does not allow usage of `matrix_layout::packed_a` or `matrix_layout::packed_b`. + #### Memory Scope -In this experimental API version, we used the terminology of `joint_matrix` instead of plain `matrix` to emphasis that the matrix is shared among a group of work items and is not private to each work item. The memory scope is added as an additional template parameter and is also part of the constructor arguments. +In this experimental API version, we used the terminology of `joint_matrix` instead of plain `matrix` to emphasize that the matrix is shared among a group of work items and is not private to each work item. The memory scope is added as an additional template parameter and is also part of the constructor arguments. -IMPORTANT: In the current implementation, only the subgroup scope is supported +IMPORTANT: In the current implementation, only the subgroup scope is supported. For Nvidia Tensorcore only the subgroup scope is supported. When the group is a `sycl::sub_group`, a matrix is declared as follows: ```c++ -joint_matrix tA(sg); +joint_matrix tC(sg); ``` #### Shape + +MMA operations multiply matrices A (matrix_use::a) (M, K) and B (matrix_use::b) (K, N) and add the result to matrix C (matrix_use::accumulator) (M, N). The logical sizes are M, K, N. + +C = A * B + C + The same class `joint_matrix` should handle both cases where sizes are constant (GPU case) and when sizes are variables (CPU case). Note that a Intel AMX 2d tile register permits sizes up to 1024 (16rowsx64cols) bytes. The ability to define only one interface for both makes it possible to give the user a way to make use of the flexibility introduced by the CPU but at the same time save resources on the GPU. We use `sycl::dynamic_extent` to differentiate between static and dynamic sizes. IMPORTANT: In the current implementation, only the static extent is supported @@ -104,10 +116,11 @@ enum class matrix_layout { Intel AMX and DPAS hardware require B matrix to be in VNNI or 32 bits packed layout. If we multiply matrices A (M, K) and B (K, N) into a matrix C (M, N). The logical sizes are M, K, N. However, the packed shape for B tile uses the VNNI format, which is described below. The user must provide the information of packed_b layout to make the implementation allocate the right shape. The layout information for Intel AMX should be specified in user code as follows: ```c++ -joint_matrix tB(sg); +joint_matrix tB(sg); ``` IMPORTANT: In the current implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. 
+**_NOTE:_** _The "packed" layout is only applicable to the AMX implementation: matrix_layout::packed is not required by the implementation of Nvidia wmma and mma instructions. We suggest that the AMX matrix extension could consider replacing its usage of matrix_layout::packed_a and matrix_layout::packed_b with the single matrix_layout::packed, in conjunction with matrix_type::a and matrix_type::b introduced here._ ## Matrix Operations and their Execution Scope @@ -121,6 +134,8 @@ IMPORTANT: In the current implementation, the layout in the load of matrix B mus Since the matrix functions are group operations (as defined in Section 4.17.3 of the SYCL specification), the matrix API has to be accessed by all the work-items in the group in a convergent control flow. The `Group` template argument can be a work-group or a subgroup. These functions will be called once by each work item in the group. +**_NOTE:_** _The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-groups. The requirement that kernels make use of the `sycl::reqd_sub_group_size` decorator is only for specific backends._ + To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. The {dpcpp} syntax is the following: IMPORTANT: In the current implementation, only the subgroup scope is supported. @@ -128,38 +143,43 @@ IMPORTANT: In the current implementation, only the subgroup scope is supported. #### Load ```c++ namespace sycl::ext::oneapi::experimental::matrix { - template - void joint_matrix_load(Group sg, joint_matrix &res, - multi_ptr src, size_t stride, matrix_layout MemLayout); +template +void joint_matrix_load( + Group sg, joint_matrix &res, + multi_ptr src, size_t stride); } ``` -This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS. +This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS or to the matrix "fragments" for Nvidia Tensorcore. Note that `Layout` is not included as an argument since it may be determined from the joint_matrix argument. + +IMPORTANT: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. #### Store ```c++ namespace sycl::ext::oneapi::experimental::matrix { - template - void joint_matrix_store(Group sg, joint_matrix &res, - multi_ptr src, size_t stride, matrix_layout memL); +template +void joint_matrix_store(Group sg, + joint_matrix &src, + multi_ptr dst, size_t stride); } ``` -This function stores the data from the 2d tiles back to memory. +This function stores the data from the 2d tiles/"fragments" back to memory. This function is only available for matrix_use::accumulator. #### Multiply and Add ```c++ namespace sycl::ext::oneapi::experimental::matrix { - template - joint_matrix joint_matrix_mad(Group sg, joint_matrix A, - joint_matrix B, joint_matrix C); +template +joint_matrix +joint_matrix_mad( + Group sg, joint_matrix A, + joint_matrix B, + joint_matrix C); } ``` The matrix multiply and add function performs the multiply operation on the matrices `A` and `B`, accumulate the result with `C` and return the result. 
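As an illustration, the fragment below sketches how the load, multiply-add, and store functions above compose for the currently implemented double (M = N = 8, K = 4) combination. The accessor names `accA`, `accB`, `accC`, the stride variables, and the surrounding kernel context are assumed, not part of the API; the template arguments follow the `joint_matrix` description earlier (element type, matrix_use, shape, layout).

```c++
using namespace sycl::ext::oneapi::experimental::matrix;

// Inside a kernel body; item is an nd_item, and accA/accB/accC are assumed
// accessors over double data.
sycl::sub_group sg = item.get_sub_group();

joint_matrix<double, matrix_use::a, 8, 4, matrix_layout::row_major> sub_a(sg);
joint_matrix<double, matrix_use::b, 4, 8, matrix_layout::row_major> sub_b(sg);
joint_matrix<double, matrix_use::accumulator, 8, 8, matrix_layout::row_major>
    sub_c(sg);

joint_matrix_load(sg, sub_a, accA.get_pointer(), stride_a);
joint_matrix_load(sg, sub_b, accB.get_pointer(), stride_b);
joint_matrix_load(sg, sub_c, accC.get_pointer(), stride_c);
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
joint_matrix_store(sg, sub_c, accC.get_pointer(), stride_c);
```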
@@ -583,13 +603,15 @@ We did not utilize this extension for this matrix API version because sub-group - Ronan Keryell: "It would be interesting to investigate whether providing also member functions would simplify the API. Provide both so it is possible to use the best one for each use case, while waiting for https://en.wikipedia.org/wiki/Uniform_Function_Call_Syntax to land into C++?" - In the future looking APIs, `get_wi_slice` (that is currently under design) returns an owned object. Should this return a view object to make sure the original matrix C is changed after its slices are modified. - +- multi_ptr can be constructed from T* since https://github.com/intel/llvm/pull/1183. However currently this cannot be used with USM for all cases. + It is expected that eventually the `joint_matrix_load` and `joint_matrix_store` interfaces will be fully compatible with USM. ## TODO List - Add support for fill matrix and element-wise operations features -- Add 'matrix_use' parameter to the matrix to distinguish between matrix A, B, and matrix accumulator. This is necessary for supporting VNNI and transpose transform - Change the names default sizes in the query from defaultM, defaultN, defaultK to M,N,K - Change the type of `scope` in the query interface to be able to return more than one value. This will be useful in the event we support other scopes like workgroup besides subgroups - Add a more realistic and complete example that shows the value of the general query +- Clarify USM compatibility +- Add Nvidia case to the query interface, and consider how the future looking API can be implemented for Nvidia. ## Revision History @@ -599,4 +621,5 @@ We did not utilize this extension for this matrix API version because sub-group |Rev |Date |Author |Changes |1 |2021-04-13 |Dounia Khaldi |Initial public working draft. |2 |2021-10-05 |Dounia Khaldi |JIT implementation on both Intel AMX and DPAS +|3 |2021-11-08 |Jack Kirk |Added matrix_use |====================== From 677ba9dfddfd4954ed2590a1dce888586f535e56 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 9 Nov 2021 10:17:04 +0000 Subject: [PATCH 03/16] Added impl status table for nvidia. Added nvidia compilation instructions. Other clarifications added on distinction between Nvidia and AMX use cases. Signed-off-by: jack.kirk --- .../Matrix/dpcpp-joint-matrix.asciidoc | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 6a0a97394feb..ba77171cb4ed 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -113,14 +113,14 @@ enum class matrix_layout { } ``` -Intel AMX and DPAS hardware require B matrix to be in VNNI or 32 bits packed layout. If we multiply matrices A (M, K) and B (K, N) into a matrix C (M, N). The logical sizes are M, K, N. However, the packed shape for B tile uses the VNNI format, which is described below. The user must provide the information of packed_b layout to make the implementation allocate the right shape. The layout information for Intel AMX should be specified in user code as follows: +Intel AMX and DPAS hardware require B matrix to be in VNNI or 32 bits packed layout. However, the packed shape for B tile uses the VNNI format, which is described below. The user must provide the information of packed_b layout to make the implementation allocate the right shape. 
The layout information for Intel AMX should be specified in user code as follows: ```c++ joint_matrix tB(sg); ``` -IMPORTANT: In the current implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. +IMPORTANT: In the current AMX and DPAS implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. -**_NOTE:_** _The "packed" layout is only applicable to the AMX implementation: matrix_layout::packed is not required by the implementation of Nvidia wmma and mma instructions. We suggest that the AMX matrix extension could consider replacing its usage of matrix_layout::packed_a and matrix_layout::packed_b with the single matrix_layout::packed, in conjunction with matrix_type::a and matrix_type::b introduced here._ +**_NOTE:_** _The "packed" layouts are only applicable to the AMX implementation: matrix_layout::packed_a and matrix_layout::packed_b are not required by the implementation of Nvidia wmma and mma instructions._ ## Matrix Operations and their Execution Scope @@ -130,7 +130,7 @@ The base pointer determines the starting address of the matrix to be loaded/stor Note that for getting maximum performance on Intel AMX and DPAS, prepacking data in the memory is necessary. If users did not specify the packed layouts (`packed_a` when matrix `C` is column major, `packed_b` when matrix `C` is row major), transforms done by the implementation will be slow due to extra scatter/gather operations. Hence, we expose these layouts `packed_a` and `packed_b` to the user to specify that A or B have already been VNNIed. The packed or VNNI layout is introduced in `VNNI layout` section below. -IMPORTANT: In the current implementation, the layout in the load of matrix B must be `packed_b`. Therefore, both the template parameter for the declaration of the B matrix and the call to `joint_matrix_load` for the B matrix must specify the `packed_b` layout. The layout in the load of matrices A and C must be `row_major`, and the layout in the store of matrix C must also be `row_major`. +IMPORTANT: In the current AMX and DPAS implementation, the layout in the load of matrix B must be `packed_b`. Therefore, both the template parameter for the declaration of the B matrix and the call to `joint_matrix_load` for the B matrix must specify the `packed_b` layout. The layout in the load of matrices A and C must be `row_major`, and the layout in the store of matrix C must also be `row_major`. Since the matrix functions are group operations (as defined in Section 4.17.3 of the SYCL specification), the matrix API has to be accessed by all the work-items in the group in a convergent control flow. The `Group` template argument can be a work-group or a subgroup. These functions will be called once by each work item in the group. @@ -255,6 +255,28 @@ q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> item) }).wait(); ``` +## Nvidia Compilation instructions + +When compiling for Nvidia the matrix extension requires specification of the architecture version. 
+This is the compilation command line needed to invoke the matrix extension on program "matrix-nvidia.cpp": + +```c++ +clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 matrix-nvidia.cpp -o output +``` +**_NOTE:_** _--cuda-gpu-arch may be set lower than sm_80 depending on the required matrix operation and whether it is supported by the desired arch._ + +### Current Nvidia Implementation status + +Currently only a single case, fp64, is implemented. All other available data types/sizes will be added shortly. + +[frame="none",options="header"] +|====================== +|data type |M |N |K | required SM version +|double (fp64) |8 |8 |4| sm_80 + + +|====================== + == Query Interface Intel AMX, DPAS and Nvidia TPUs support different sizes and types. The query interface is used to validate user code and inform them about supported types, sizes, scope, and layouts by the implementation. From 33978b9de916ae751203d98d4457106d4ce667e0 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 9 Nov 2021 11:03:16 +0000 Subject: [PATCH 04/16] Improved presentation. Signed-off-by: jack.kirk --- .../Matrix/dpcpp-joint-matrix.asciidoc | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index ba77171cb4ed..84eec4bed530 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -82,7 +82,7 @@ In this experimental API version, we used the terminology of `joint_matrix` inst IMPORTANT: In the current implementation, only the subgroup scope is supported. For Nvidia Tensorcore only the subgroup scope is supported. -When the group is a `sycl::sub_group`, a matrix is declared as follows: +When the group is a `sycl::sub_group`, an example matrix declaration is as follows: ```c++ joint_matrix tC(sg); @@ -98,9 +98,22 @@ The same class `joint_matrix` should handle both cases where sizes are constant IMPORTANT: In the current implementation, only the static extent is supported +**_NOTE:_** _Nvidia tensorcore instructions only support a discrete set of matrix shapes. The supported matrix shapes depend upon the data types of the "a", "b", and "accumulator" matrices involved in the MMA operation._ + +##### Current Nvidia Implementation status + +Currently only a single case, where all matrices consist of elements of double (fp64) type, is implemented. All other available matrix types/shapes will be added shortly. + +[frame="none",options="header"] +|====================== +|"a" data type |"b" data type |"accumulator" data type |M |N |K | required SM version +|double |double |double |8 |8 |4| sm_80 + + +|====================== #### Layout -Besides row major and column major layouts, `matrix_layout` is flexible enough to introduce customed layouts such as symmetric or tiled layouts. +Besides row major and column major layouts, `matrix_layout` is flexible enough to introduce custom layouts such as symmetric or tiled layouts. ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -120,7 +133,7 @@ joint_matrix tB(sg); ``` IMPORTANT: In the current AMX and DPAS implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. 
-**_NOTE:_** _The "packed" layouts are only applicable to the AMX implementation: matrix_layout::packed_a and matrix_layout::packed_b are not required by the implementation of Nvidia wmma and mma instructions._ +IMPORTANT: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. The "packed" layouts are only applicable to the AMX implementation: matrix_layout::packed_a and matrix_layout::packed_b are not required by the implementation of Nvidia wmma and mma instructions. ## Matrix Operations and their Execution Scope @@ -134,7 +147,7 @@ IMPORTANT: In the current AMX and DPAS implementation, the layout in the load of Since the matrix functions are group operations (as defined in Section 4.17.3 of the SYCL specification), the matrix API has to be accessed by all the work-items in the group in a convergent control flow. The `Group` template argument can be a work-group or a subgroup. These functions will be called once by each work item in the group. -**_NOTE:_** _The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-groups. The requirement that kernels make use of the `sycl::reqd_sub_group_size` decorator is only for specific backends._ +**_NOTE:_** _The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-group, and also the default DPC++ sub-group size for the CUDA backend._ To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. The {dpcpp} syntax is the following: @@ -152,9 +165,6 @@ void joint_matrix_load( ``` This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS or to the matrix "fragments" for Nvidia Tensorcore. Note that `Layout` is not included as an argument since it may be determined from the joint_matrix argument. -IMPORTANT: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. - - #### Store ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -265,18 +275,6 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-g ``` **_NOTE:_** _--cuda-gpu-arch may be set lower than sm_80 depending on the required matrix operation and whether it is supported by the desired arch._ -### Current Nvidia Implementation status - -Currently only a single case, fp64, is implemented. All other available data types/sizes will be added shortly. - -[frame="none",options="header"] -|====================== -|data type |M |N |K | required SM version -|double (fp64) |8 |8 |4| sm_80 - - -|====================== - == Query Interface Intel AMX, DPAS and Nvidia TPUs support different sizes and types. The query interface is used to validate user code and inform them about supported types, sizes, scope, and layouts by the implementation. @@ -318,8 +316,6 @@ The table below provides a description for each of the member variables and type - - ```c++ namespace sycl::ext::oneapi::experimental::matrix { From 71c1b7eee5a29e889b330fcfee0b3c021d6f235e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 9 Nov 2021 11:12:50 +0000 Subject: [PATCH 05/16] Some further small style changes. 
Signed-off-by: jack.kirk --- sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 84eec4bed530..8c97a9edc9f5 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -98,7 +98,7 @@ The same class `joint_matrix` should handle both cases where sizes are constant IMPORTANT: In the current implementation, only the static extent is supported -**_NOTE:_** _Nvidia tensorcore instructions only support a discrete set of matrix shapes. The supported matrix shapes depend upon the data types of the "a", "b", and "accumulator" matrices involved in the MMA operation._ +**_NOTE:_** _Nvidia tensorcore instructions only support a discrete set of matrix shapes. The supported matrix shapes correspond with particular data types of the "a", "b", and "accumulator" matrices involved in the MMA operation._ ##### Current Nvidia Implementation status @@ -192,7 +192,7 @@ joint_matrix_mad( joint_matrix C); } ``` -The matrix multiply and add function performs the multiply operation on the matrices `A` and `B`, accumulate the result with `C` and return the result. +The matrix multiply and add function performs the multiply operation on the matrices `A` and `B`, accumulates the result with `C` and returns the result. ## VNNI/Packed Layout From b5b03f32bb1c84aebeb187e02645f456ea223412 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 9 Nov 2021 11:14:49 +0000 Subject: [PATCH 06/16] Removed tensorcore specific proposal. Signed-off-by: jack.kirk --- .../dpcpp-joint-matrix-tensorcore.asciidoc | 287 ------------------ 1 file changed, 287 deletions(-) delete mode 100644 sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc deleted file mode 100644 index ffd8d16a052b..000000000000 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix-tensorcore.asciidoc +++ /dev/null @@ -1,287 +0,0 @@ -# (Nvidia Tensorcore) Matrix Programming Extension for DPC++: SYCL_EXT_ONEAPI_MATRIX=3 -:source-highlighter: coderay -:coderay-linenums-mode: table -:dpcpp: pass:[DPC++] - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - - -== Notice - -Copyright (c) 2021-2021 Intel Corporation. All rights reserved. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are -trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. -used by permission by Khronos. - -This extension is written against the SYCL 2020 revision 3 specification. All -references below to the "core SYCL specification" or to section numbers in the -SYCL specification refer to that revision. This extension builds on the existing AMX based matrix https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc[extension]. - - -**_NOTE:_** _This document describes the current design and API for the Nvidia tensorcore version of the matrix extension to {dpcpp}. 
This is an initial experimental version to try out functionality and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. The current implementation provides support for the matrix extension interface on Nvidia(R) Tensorcores. We are going to work with the community on incrementally improving the API to develop a single matrix interface that may be used for all backend architectures._ - -## Introduction - -This document presents an ongoing work towards defining a unified matrix interface. This extension applies the existing experimental matrix extension (designed for the AMX architecture) to Nvidia tensorcore hardware, making small adaptations where necessary. - -**_NOTE:_** _Any necessary adaptations to the extension aim to ensure compatibility with a suitable AMX matrix implementation; any necessary adaptations to the existing AMX implementation resulting from changes introduced in this proposal should be small._ - -The initial implementation of this extension uses Warp Matrix Multiply Accumulate https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-wmma[(wmma) PTX instructions] which can be generally used with Volta (sm_70, sm_72), Turing (sm_75), and Ampere (sm_80, sm_86) architecture generations. These instructions are also expected to be forward compatible with future Nvidia generations. A future implementation may additionally make use of Nvidia PTX mma instructions which are architecture generation specific, and may increase performance with respect to corresponding wmma instructions. It is possible to implement mma ptx instructions without additional changes to this extension proposal. - -## Feature test macro - -This extension uses the existing feature-test macro used by the AMX matrix extension. Feature test macros are described in the core SYCL -specification section 6.3.3 "Feature test macros". An -implementation supporting this extension must predefine the macro -`SYCL_EXT_ONEAPI_MATRIX` to one of the values defined in the table below. -Applications can test for the existence of this macro to determine if the -implementation supports this feature, or applications can test the macro's -value to determine which of the extension's APIs the implementation supports. - -[frame="none",options="header"] -|====================== -|Value |Description -|3 |Initial extension implementation on Nvidia Tensorcore. Base features are supported. -|====================== - -## Currently implemented additions with respect to the AMX proposal - -### Matrix Use - -We introduce a new `matrix_use` enum which is necessary to distingush the correct low level PTX instruction for each operation. - -```c++ -namespace sycl::ext::intel::experimental::matrix { -enum class matrix_type { a, b, accumulator }; -} -``` - -### Layout - -We adapt the Layout enum by including only a single `matrix_layout::packed` value. Different "packed" variations for A and B matrix types can be determined by the new `matrix_type` enum. - -**_NOTE:_** _The "packed" layout is only applicable to the AMX implementation: matrix_layout::packed is not required by the implementation of Nvidia wmma and mma instructions. 
We suggest that the AMX matrix extension could consider replacing its usage of matrix_layout::packed_a and matrix_layout::packed_b with the single matrix_layout::packed, in conjunction with matrix_type::a and matrix_type::b introduced here._ - -```c++ -namespace sycl::ext::intel::experimental::matrix { -enum class matrix_layout { row_major, col_major, packed }; -} -``` - -## Types, Shapes, and Layouts - -Unlike the AMX case, Nvidia Tensorcore architecture only supports a discrete set of matrix sizes that can form part of a Multiply Accumulate operation, and the supported matrix sizes depends on the data type of the matrix elements. - -MMA operations multiply matrices A (`matrix_type::a`) (M, K) and B (`matrix_type::b`) (K, N) and add the result to matrix C (`matrix_type::accumulator`) (M, N). The logical sizes are M, K, N. - -C = A*B + C - -### Current Implementation Restrictions - -Currently only a single case: fp64 (M = N = 8, K = 4) is implemented: - -A(double, 8x4, row_major/col_major), B(double, 4x8, row_major/col_major), C(double, 8x8, row_major/col_major) - -In order to deal with different cases we use partial specialization of the various template functions introduced by the extension. LLVM builtins are available for all possible matrix shapes, and runtime implementations covering these cases will be progressively added. - -### `joint_matrix` interface uses the new parameter, `matrix_type`, with respect to the AMX proposal - -We reuse the `joint_matrix` interface but add the new parameter, `matrix_type`. The user needs to additionally specify the type of the elements, shape, memory layout, and memory scope of the matrix. This results into the following description: - -```c++ -template -struct joint_matrix { - joint_matrix(Group g) {} -}; -``` - -## Matrix Operations and their Execution Scope - -We define the three functions needed to perform the main and common operations on matrices namely, load, store, and the actual Multiply And Add operation. This set of functions can be easily extended if the Nvidia Tensorcore hardware implements new features. - -Since the matrix functions are group operations (as defined in Section 4.17.3 of the SYCL specification), the matrix API has to be accessed by all the work-items in the group in a convergent control flow. As described in the AMX extension, `joint_matrix` is shared across a number of work-items that is hardware dependent. For the case of Nvidia the number of work-items (CUDA threads) is equal to the warp size (32). - -For the CUDA backend the work-items that share a `joint_matrix` instance belong to the same sub-group. The group template parameter provided to `joint_matrix` is always a `sycl::sub_group`. For example a column major matrix must be declared as follows: - -```c++ -joint_matrix tB; -``` - -where currently only `T = double` is supported. - -**_NOTE:_** _The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-groups. The requirement that kernels make use of the `sycl::reqd_sub_group_size` decorator is only for specific backends._ - -To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. 
The {dpcpp} syntax is the following: - -### Load - -```c++ -template -void joint_matrix_load( - Group sg, joint_matrix &res, - multi_ptr src, size_t stride) { - detail::joint_matrix_load_impl{} - .load(res, src, stride); -} -``` - -This function loads data from memory to the Nvidia matrix "fragments". - -The base pointer, `src`, determines the starting address of the sub-matrix to be loaded/stored. `layout` determines whether the data are being read/written with leading dimension `row_major` or `column_major`. `stride` describes the number of elements between consecutive rows for row major and packed layout, or columns for column major layout. - -IMPORTANT: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and the layout in the store of matrix C must also be either `row_major` or `col_major`. - -**_NOTE:_** _The Layout argument has been removed with respect to the AMX extension in both `joint_matrix_load` and `joint_matrix_store`, since the Layout may be determined from the `joint_matrix`. The addition of the `matrix_type` enumerator may also simplify the AMX implementation so that the Layout argument in `joint_matrix_load` and `joint_matrix_store` can be similarly removed for that case._ - -The stride is currently passed to the wmma ptx instructions. The wmma ptx instruction then uses stride to pick the correct address for the current thread to load the correct fragment depending on the architecture. When ptx mma instructions are used instead of the general wmma instructions, it becomes the responsibility of the implementation to provide the ptx mma instructions executed by each thread with the correct address to load fragments from. The implementation can make use of `stride` to find the correct addresses. - -### Store - -```c++ -template -void joint_matrix_store(Group sg, - joint_matrix &src, - multi_ptr dst, size_t stride) { - detail::joint_matrix_store_impl{} - .store(src, dst, stride); -} -``` -This function stores the data from the Nvidia matrix "fragments" back to memory. - -### Matrix fragments - -Fragments hold a set of matrix elements. Each thread is responsible for a fragment of the matrix. Depending on its usage, a fragment may hold a single row or column of a matrix, or a subset of a row or column. The number of matrix elements held by each thread in a fragment depends on the matrix operation being executed. For some matrix shapes/matrix element data types, matrix elements are packed into a larger data type within a fragment. wmma ptx instructions pick the appropriate thread for each matrix fragment depending on the architecture generation used. - -As stated by the Nvidia PTX ISA: - -*"Each thread in the warp holds a fragment of the matrix. The distribution of fragments loaded by the threads in a warp is unspecified and is target architecture dependent, and hence the identity of the fragment within the matrix is also unspecified and is target architecture dependent."* - -In the hardware specific mma ptx instructions the distribution of fragments loaded by the threads in a warp is specified. It is therefore the responsibility of the implementation to provide the correct address for the contiguous matrix elements corresponding to each fragment. 
- -### Multiply and Add - -```c++ -template -joint_matrix -joint_matrix_mad( - Group sg, joint_matrix A, - joint_matrix B, - joint_matrix C) { - return detail::joint_matrix_mad_impl{} - .mad(sg, A, B, C); -} -``` -The matrix multiply and add function performs the multiply operation on the matrices `A` and `B`, accumulates the result with `C` and returns the result. - -## Concise example using double type and row_major matrices - -```c++ -using namespace sycl::ext::intel::experimental::matrix; - -cgh.parallel_for( - nd_range<2>(GlobalRange, - LocalRange), - [=](nd_item<2> item){ - sub_group sg = item.get_sub_group(); - const auto m = item.get_group().get_id()[0]; // row id of current submatrix of BIG C matrix. - const auto n = item.get_group().get_id()[1]; // column id of current submatrix of BIG C matrix. - joint_matrix sub_c; - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * BIG_N + n * N, STRIDE_C); - for (int k = 0; k < SUB_TILES_K; k += 1) {// row/col id of current submatrix of BIG A/B matrices. - joint_matrix_load(sg, sub_a, accA.get_pointer() + (k * K) + (m * M * BIG_K), STRIDE_A); - joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * K * BIG_N) + (n * N), STRIDE_B); - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);} - joint_matrix_store(sg, sub_c, accD.get_pointer() + (m * M) * BIG_N + n * N, STRIDE_C);});}); -``` - -## Implementation Status - -Currently, this is the compilation command line needed to invoke the extension on program "matrix-cuda.cpp": - -```c++ -clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 matrix-cuda.cpp -o output -``` -**_NOTE:_** _--cuda-gpu-arch may be set lower than sm_80 depending on the required matrix operation and whether it is supported by the desired arch._ - -## Future Implementation Work - -### Dealing with tf32 and bf16 matrix element types - -Alternative CUDA floating point types, bf16 and tf32, use the same number of bits for the exponent as fp32, so that these data types can cover the same range of numbers as float using lower precision. For this reason a DPC++ programmer will be able to use these more efficient low precision data types in matrix operations by providing a matrix array consisting of fp32 elements as an argument to `joint_matrix_load` or `joint_matrix_store`. -We will introduce a new enum, `matrix::matrix_type`, that must be provided to the `joint_matrix` interface as an additional argument when the user desires bf16 or tf32 to be used as the A, B matrix element data type. A future implementation will make use of the https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt[cvt PTX instruction] to cast the fp32 elements to either the tf32 or bf16 type. - -```c++ -namespace sycl::ext::intel::experimental::matrix { -enum class matrix_type -{ - tf32, - bf16 -}; -} -``` - -### Clarify USM compatibility - -multi_ptr can be constructed from T* since https://github.com/intel/llvm/pull/1183. However currently this cannot be used with USM for all cases. -It is expected that eventually the `joint_matrix_load` and `joint_matrix_store` interfaces will be fully compatible with USM. 
Currently USM has only been validated to work with this extension for a single case: using shared USM pointers by casting them to the global address space in the following way: - -```c++ -joint_matrix_load(sg, sub_c, global_ptr(d_C) + (m * M) * BIG_N + n * N, STRIDE_C); -``` - -Where d_C is a shared USM pointer, e.g.: - -```c++ -double* d_C = malloc_shared(size, queue); -``` - -However even this case is not reliable and requires more testing. - -### Ensuring that non-portable cases provide intelligible error messages to users. - -This extension proposal is intended to be compatible with a hypothetical AMX implementation. However this requirement necessitates the inclusion of `matrix_layout::packed` which is incompatible with the CUDA implementations of `joint_matrix`, `joint_matrix_load`, `joint_matrix_store`, and `joint_matrix_mad`. Similar portability issues would occur in the other direction once cases dealing with the alternative CUDA floating point types, tf32 and bf16, are implemented. In addition, more backends are expected to support the matrix extension in the future. This means that a common means of reporting errors that result from users attempting to e.g. port code written for AMX using the packed format to CUDA, needs to be defined in a more mature version of the matrix extension. - -### Implementation of hardware generation specific mma ptx instructions - -It should be decided whether mma ptx instructions are to be a default optimization when available, or whether the dpc++ programmer should decide whether to use these potential optimizations. - -## TODO List - -- Add an implementation for matrix multiplication using the tf32 and bf16 types. -- Add remaining shapes/data types for wmma instructions. -- Verify that USM is fully compatible once a USM pointer can be generally correctly cast to multi_ptr. -- Work out and maintain a common interface with AMX (and other archs). -- Optimize for specific Nvidia hardware using mma ptx instructions. - -## Revision History - -[frame="none",options="header"] -|====================== -|Rev |Date |Author |Changes -|1 | |Jack Kirk |Initial public working draft. -|====================== From 9e7051207c5ec07b54fd1fbab2dcfdaabca16972 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 15 Nov 2021 20:17:18 +0000 Subject: [PATCH 07/16] Separated Nvidia implementation descriptions etc. Signed-off-by: jack.kirk --- .../Matrix/dpcpp-joint-matrix.asciidoc | 148 ++++++++++++------ sycl/doc/extensions/README.md | 4 +- 2 files changed, 104 insertions(+), 48 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 8c97a9edc9f5..40ec8276dab7 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -33,11 +33,11 @@ SYCL specification refer to that revision. **_NOTE:_** _This document describes the current design and API for the matrix extension to {dpcpp}. This is an initial experimental version to try out functionality -and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. The current implementation provides support of the matrix interface on Intel(R) Advanced Matrix Extensions (AMX), DPAS, and Nvidia Tensorcore. We are going to work with the community on incrementally improving +and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. 
The current implementation provides support of the matrix interface on Intel(R) Advanced Matrix Extensions (AMX), DPAS, and Nvidia® Tensorcore. We are going to work with the community on incrementally improving the API to bring them closer to standard C++ (aligned with the `std::mdspan` and `std::mdarray` proposals) and SYCL in the next several months._ ## Introduction -This document presents an ongoing work towards defining a unified matrix interface. This interface is intended to unify different tensor hardware: Intel AMX in CPUs, Habana Gaudi and Goya tensor and gemm cores, Nvidia TPUs, IBM Power MMA. All these hardware provide low-level intrinsics or assembly to access and perform matrix operations. The goal is to provide a unified interface that is portable but also benefits from the maximum performance these different hardware can offer. +This document presents an ongoing work towards defining a unified matrix interface. This interface is intended to unify different tensor hardware: Intel AMX in CPUs, Habana Gaudi and Goya tensor and gemm cores, Nvidia® TPUs, IBM Power MMA. All these hardware provide low-level intrinsics or assembly to access and perform matrix operations. The goal is to provide a unified interface that is portable but also benefits from the maximum performance these different hardware can offer. ## Feature test macro @@ -54,11 +54,28 @@ value to determine which of the extension's APIs the implementation supports. |Value |Description |1 |Initial extension implementation on Intel AMX. Base features are supported. |2 |Initial extension JIT implementation on Intel AMX and DPAS. load, store, mad and the query interface are supported -|3 |Initial extension implementation on Nvidia Tensorcore. Base features are supported. +|3 |Initial extension implementation on Nvidia® Tensor Cores. Base features are supported. |====================== ## New `joint_matrix` class -We introduce a new class called `joint_matrix`. The user needs to specify the type of the elements, matrix_use (a, b, accumulator), shape, the memory layout, and the memory scope of the matrix. This results into the following description: + +We introduce a new class called `joint_matrix`. The user needs to specify the type of the elements, shape, the memory layout, and the memory scope of the matrix. This results in the following description: + +```c++ +namespace sycl::ext::oneapi::experimental::matrix { +template +struct joint_matrix { + joint_matrix(Group g) {} +}; +} +``` + +### New `matrix_use` Enumeration used in Nvidia® case + +A new parameter has been added to the Tensor Core implementation of joint_matrix. The type of the new parameter is an enum, `matrix_use` (a, b, accumulator). +`matrix_use` is used to distinguish the suitable place for a given matrix in a Multiply and Add operation. The Multiply and Add operation is decribed later on in this document. The long term plan is to incorporate the use argument for other TPUs (AMX and DPAS). In the Tensor Cores architecture the supported matrix shapes and data types depend upon the `matrix_use`. +The joint_matrix interface used by the Tensor Cores implementation is: ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -66,51 +83,35 @@ template + typename Group> struct joint_matrix { joint_matrix(Group g) {} }; } ``` -The final optional conditional argument can be used to remove cases for template parameter values which are incompatible with a particular backend. 
-For example, the Nvidia Tensorcore backend does not allow usage of `matrix_layout::packed_a` or `matrix_layout::packed_b`. - - #### Memory Scope In this experimental API version, we used the terminology of `joint_matrix` instead of plain `matrix` to emphasize that the matrix is shared among a group of work items and is not private to each work item. The memory scope is added as an additional template parameter and is also part of the constructor arguments. -IMPORTANT: In the current implementation, only the subgroup scope is supported. For Nvidia Tensorcore only the subgroup scope is supported. +IMPORTANT: In the current implementations of Intel AMX, Intel DPAS, and Nvidia® Tensor Cores, only the subgroup scope is supported. When the group is a `sycl::sub_group`, an example matrix declaration is as follows: ```c++ -joint_matrix tC(sg); +joint_matrix tA(sg); ``` -#### Shape +In the Nvidia case an example declaration adds the matrix_use parameter, e.g.: -MMA operations multiply matrices A (matrix_use::a) (M, K) and B (matrix_use::b) (K, N) and add the result to matrix C (matrix_use::accumulator) (M, N). The logical sizes are M, K, N. +```c++ +joint_matrix tA(sg); +``` -C = A * B + C +#### Shape The same class `joint_matrix` should handle both cases where sizes are constant (GPU case) and when sizes are variables (CPU case). Note that a Intel AMX 2d tile register permits sizes up to 1024 (16rowsx64cols) bytes. The ability to define only one interface for both makes it possible to give the user a way to make use of the flexibility introduced by the CPU but at the same time save resources on the GPU. We use `sycl::dynamic_extent` to differentiate between static and dynamic sizes. -IMPORTANT: In the current implementation, only the static extent is supported - -**_NOTE:_** _Nvidia tensorcore instructions only support a discrete set of matrix shapes. The supported matrix shapes correspond with particular data types of the "a", "b", and "accumulator" matrices involved in the MMA operation._ - -##### Current Nvidia Implementation status - -Currently only a single case, where all matrices consist of elements of double (fp64) type, is implemented. All other available matrix types/shapes will be added shortly. - -[frame="none",options="header"] -|====================== -|"a" data type |"b" data type |"accumulator" data type |M |N |K | required SM version -|double |double |double |8 |8 |4| sm_80 - - -|====================== +IMPORTANT: In the current implementation, only the static extent is supported. #### Layout Besides row major and column major layouts, `matrix_layout` is flexible enough to introduce custom layouts such as symmetric or tiled layouts. @@ -126,14 +127,14 @@ enum class matrix_layout { } ``` -Intel AMX and DPAS hardware require B matrix to be in VNNI or 32 bits packed layout. However, the packed shape for B tile uses the VNNI format, which is described below. The user must provide the information of packed_b layout to make the implementation allocate the right shape. The layout information for Intel AMX should be specified in user code as follows: +Intel AMX and DPAS hardware require B matrix to be in VNNI or 32 bits packed layout. If we multiply matrices A (M, K) and B (K, N) into a matrix C (M, N). The logical sizes are M, K, N. However, the packed shape for B tile uses the VNNI format, which is described below. The user must provide the information of packed_b layout to make the implementation allocate the right shape. 
The layout information for Intel AMX should be specified in user code as follows: ```c++ -joint_matrix tB(sg); -``` -IMPORTANT: In the current AMX and DPAS implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. +joint_matrix tB(sg); +``` -IMPORTANT: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. The "packed" layouts are only applicable to the AMX implementation: matrix_layout::packed_a and matrix_layout::packed_b are not required by the implementation of Nvidia wmma and mma instructions. +IMPORTANT: * AMX and DPAS: In the current AMX and DPAS implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. + * Tensor Cores: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. ## Matrix Operations and their Execution Scope @@ -147,13 +148,30 @@ IMPORTANT: In the current AMX and DPAS implementation, the layout in the load of Since the matrix functions are group operations (as defined in Section 4.17.3 of the SYCL specification), the matrix API has to be accessed by all the work-items in the group in a convergent control flow. The `Group` template argument can be a work-group or a subgroup. These functions will be called once by each work item in the group. -**_NOTE:_** _The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-group, and also the default DPC++ sub-group size for the CUDA backend._ +IMPORTANT: The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-group, and also the default DPC++ sub-group size for the CUDA backend. To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. The {dpcpp} syntax is the following: IMPORTANT: In the current implementation, only the subgroup scope is supported. #### Load + +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + template + void joint_matrix_load(Group sg, joint_matrix &res, + multi_ptr src, size_t stride, matrix_layout MemLayout); +} +``` + +This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS or Nvidia® Tensor Cores. + +#### Tensor Cores syntax + +In the Tensor Cores implementation the matrix_use enum is added: + ```c++ namespace sycl::ext::oneapi::experimental::matrix { template src, size_t stride); } ``` -This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS or to the matrix "fragments" for Nvidia Tensorcore. Note that `Layout` is not included as an argument since it may be determined from the joint_matrix argument. + +Note that `Layout` is not included as an argument since it may be determined from the joint_matrix argument. #### Store + +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + template + void joint_matrix_store(Group sg, joint_matrix &res, + multi_ptr src, size_t stride, matrix_layout memL); +} +``` + +This function stores the data from the 2d tiles/registers back to memory. 
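As a usage illustration of the generic forms above (a sketch only: `accC`, `sub_c`, `sub_a`, `sub_b`, `row`, `col`, and `BIG_N` are placeholder names and the `joint_matrix` declarations are elided), a load/store round trip of an accumulator tile held in a row-major matrix with leading dimension `BIG_N` might look like:

```c++
// Sketch only: read an accumulator tile, accumulate into it, write it back.
joint_matrix_load(sg, sub_c, accC.get_pointer() + row * BIG_N + col,
                  /*stride=*/BIG_N, matrix_layout::row_major);
// ... sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); ...
joint_matrix_store(sg, sub_c, accC.get_pointer() + row * BIG_N + col,
                   /*stride=*/BIG_N, matrix_layout::row_major);
```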
+ +##### Tensor Cores syntax + ```c++ namespace sycl::ext::oneapi::experimental::matrix { template dst, size_t stride); } ``` -This function stores the data from the 2d tiles/"fragments" back to memory. This function is only available for matrix_use::accumulator. +Note that this function is only available for matrix_use::accumulator. #### Multiply and Add +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + template + joint_matrix joint_matrix_mad(Group sg, joint_matrix A, + joint_matrix B, joint_matrix C); +} +``` + +Multiply and Add (MAD) operations, also known as Matrix Multiply and Accumulate (MMA), multiply matrices A (matrix_use::a) with shape (M, K) and B (matrix_use::b) with shape (K, N) and add the result to matrix C (matrix_use::accumulator) with shape (M, N). The logical sizes are M, K, N. + +C = A * B + C + +##### Tensor Cores syntax + ```c++ namespace sycl::ext::oneapi::experimental::matrix { template C); } ``` -The matrix multiply and add function performs the multiply operation on the matrices `A` and `B`, accumulates the result with `C` and returns the result. - ## VNNI/Packed Layout Intel AMX and DPAS compute assumes register for B tile (src1) to be in VNNI format as they need 32bit of K-data in A and B to be contiguous in memory. @@ -265,10 +314,9 @@ q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> item) }).wait(); ``` -## Nvidia Compilation instructions +IMPORTANT: When compiling for Nvidia® the matrix extension requires specification of the architecture version. The compilation invocation is given below. -When compiling for Nvidia the matrix extension requires specification of the architecture version. -This is the compilation command line needed to invoke the matrix extension on program "matrix-nvidia.cpp": +This is the compilation command line needed to invoke the Tensor Cores matrix extension on program "matrix-nvidia.cpp": ```c++ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 matrix-nvidia.cpp -o output @@ -276,7 +324,7 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-g **_NOTE:_** _--cuda-gpu-arch may be set lower than sm_80 depending on the required matrix operation and whether it is supported by the desired arch._ == Query Interface -Intel AMX, DPAS and Nvidia TPUs support different sizes and types. +Intel AMX, DPAS and Nvidia® TPUs support different sizes and types. The query interface is used to validate user code and inform them about supported types, sizes, scope, and layouts by the implementation. This also offers development and tuning productivity by both scientists and library developers. The query interface we are proposing here is a compile-time query, so there will be no runtime errors. @@ -288,7 +336,7 @@ The query interface proposed here consists of three functionalities: - General query: the general query interface provides information about sizes, types, static/dynamic, and scopes that are supported by a specific TPU implementation. This is needed to avoid padding by the user, for tuning, and efficient code generation if used by a library. The general query return an array of `combinations` of `combination` type. Each combination includes the sizes and the types for the matrices A, B, and C. Note that for each TPU, the query returns `max_msize, max_nsize, max_ksize` or `msize, nsize, ksize` exclusively depending whether the implementation supports a continuous or discrete number of sizes. 
For example, Intel AMX implementation supports a continuous number of sizes so the `max_*` variant is applied and only the maximum number is returned. DPAS implementation, on the other hand, supports a discrete list of numbers so the `msize, nsize, ksize` variant is applied. This form takes place when users only specify the TPU they are interested in using. -The table below provides a description for each of the member variables and type aliases in `tpu_params` class and the forms in which they are defined. +The table below provides a description for each of the member variables and type aliases in `tpu_params` class and the forms in which they are defined. [frame="none",options="header"] |====================== @@ -581,7 +629,7 @@ joint_matrix C(sg); The problem with this option is that it is restrictive to a very limited set of operations. #### Option3: Restrictive conversion in the interface from SIMD to SPMD -Nvidia wmma interface added a new member to `fragment` class to designate the WI owned part of the matrix. +Nvidia® wmma interface added a new member to `fragment` class to designate the WI owned part of the matrix. While this provides fast element indexing on the GPU compared to the non-restrictive option, the user does not know the mapping of the owned data to the original matrix. However using the `mma` ptx instructions as opposed to the `wmma` ptx instructions the mapping is known. Knowing this mapping is important for the user to implement new operations like sum of rows of a matrix for quantized algorithms. @@ -623,14 +671,20 @@ We did not utilize this extension for this matrix API version because sub-group - In the future looking APIs, `get_wi_slice` (that is currently under design) returns an owned object. Should this return a view object to make sure the original matrix C is changed after its slices are modified. - multi_ptr can be constructed from T* since https://github.com/intel/llvm/pull/1183. However currently this cannot be used with USM for all cases. It is expected that eventually the `joint_matrix_load` and `joint_matrix_store` interfaces will be fully compatible with USM. + ## TODO List - Add support for fill matrix and element-wise operations features - Change the names default sizes in the query from defaultM, defaultN, defaultK to M,N,K - Change the type of `scope` in the query interface to be able to return more than one value. This will be useful in the event we support other scopes like workgroup besides subgroups - Add a more realistic and complete example that shows the value of the general query + +Tensor Cores: + - Clarify USM compatibility -- Add Nvidia case to the query interface, and consider how the future looking API can be implemented for Nvidia. +- Add support for other combinations, the query interface, and consider how the future looking API can be added here + AMX and DPAS: +- Add 'matrix_use' parameter to the matrix to distinguish between matrix A, B, and matrix accumulator. This is necessary for supporting VNNI and transpose transform ## Revision History @@ -639,5 +693,5 @@ We did not utilize this extension for this matrix API version because sub-group |Rev |Date |Author |Changes |1 |2021-04-13 |Dounia Khaldi |Initial public working draft. 
|2 |2021-10-05 |Dounia Khaldi |JIT implementation on both Intel AMX and DPAS -|3 |2021-11-08 |Jack Kirk |Added matrix_use +|3 |2021-11-08 |Jack Kirk |Initial AOT use case on Nvidia® Tensor Cores |====================== diff --git a/sycl/doc/extensions/README.md b/sycl/doc/extensions/README.md index 5e17f716c845..59fa006158bb 100755 --- a/sycl/doc/extensions/README.md +++ b/sycl/doc/extensions/README.md @@ -42,7 +42,9 @@ DPC++ extensions status: | [Invoke SIMD](InvokeSIMD/InvokeSIMD.asciidoc) | Proposal | | | [Uniform](Uniform/Uniform.asciidoc) | Proposal | | | [Assert](Assert/SYCL_ONEAPI_ASSERT.asciidoc) | Proposal | | -| [Matrix](Matrix/dpcpp-joint-matrix.asciidoc) | Partially supported(AMX AOT) | Not supported: dynamic-extent, wg and wi scopes, layouts other than packed| +| [Matrix](Matrix/dpcpp-joint-matrix.asciidoc) | Partially supported (AMX, DPAS, Tensor Cores (AOT)) | Not supported: AMX, DPAS, Tensor Cores: dynamic-extent, wg and wi scopes; Not supported: AMX, DPAS: layouts other than packed; Tensor Cores: The only supported combination is +"a" data type |"b" data type |"accumulator" data type |M |N |K | required SM version +|double |double |double |8 |8 |4| sm_80 | [SYCL_INTEL_free_function_queries](FreeFunctionQueries/SYCL_INTEL_free_function_queries.asciidoc) | Supported (experimental) | | | [EXT_ONEAPI_max_work_groups](MaxWorkGroupQueries/max_work_group_query.md) | Supported | | | [SYCL_EXT_ONEAPI_DEVICE_GLOBAL](DeviceGlobal/SYCL_INTEL_device_global.asciidoc) | Proposal | | From 23f7f1ff0653b03344a233c44a0281683df1fe6e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 15 Nov 2021 20:21:08 +0000 Subject: [PATCH 08/16] Removed nested table. Signed-off-by: jack.kirk --- sycl/doc/extensions/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/doc/extensions/README.md b/sycl/doc/extensions/README.md index 59fa006158bb..a0041cf970d6 100755 --- a/sycl/doc/extensions/README.md +++ b/sycl/doc/extensions/README.md @@ -42,9 +42,7 @@ DPC++ extensions status: | [Invoke SIMD](InvokeSIMD/InvokeSIMD.asciidoc) | Proposal | | | [Uniform](Uniform/Uniform.asciidoc) | Proposal | | | [Assert](Assert/SYCL_ONEAPI_ASSERT.asciidoc) | Proposal | | -| [Matrix](Matrix/dpcpp-joint-matrix.asciidoc) | Partially supported (AMX, DPAS, Tensor Cores (AOT)) | Not supported: AMX, DPAS, Tensor Cores: dynamic-extent, wg and wi scopes; Not supported: AMX, DPAS: layouts other than packed; Tensor Cores: The only supported combination is -"a" data type |"b" data type |"accumulator" data type |M |N |K | required SM version -|double |double |double |8 |8 |4| sm_80 +| [Matrix](Matrix/dpcpp-joint-matrix.asciidoc) | Partially supported (AMX, DPAS, Tensor Cores (AOT)) | Not supported: AMX, DPAS, Tensor Cores: dynamic-extent, wg and wi scopes; Not supported: AMX, DPAS: layouts other than packed; Tensor Cores: The only supported data type is double | [SYCL_INTEL_free_function_queries](FreeFunctionQueries/SYCL_INTEL_free_function_queries.asciidoc) | Supported (experimental) | | | [EXT_ONEAPI_max_work_groups](MaxWorkGroupQueries/max_work_group_query.md) | Supported | | | [SYCL_EXT_ONEAPI_DEVICE_GLOBAL](DeviceGlobal/SYCL_INTEL_device_global.asciidoc) | Proposal | | From d818faf988feea8534697e17a2b61251467965be Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 15 Nov 2021 20:33:08 +0000 Subject: [PATCH 09/16] Typos Signed-off-by: jack.kirk --- .../Matrix/dpcpp-joint-matrix.asciidoc | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git 
a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 40ec8276dab7..70ebea938e00 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -74,7 +74,7 @@ struct joint_matrix { ### New `matrix_use` Enumeration used in Nvidia® case A new parameter has been added to the Tensor Core implementation of joint_matrix. The type of the new parameter is an enum, `matrix_use` (a, b, accumulator). -`matrix_use` is used to distinguish the suitable place for a given matrix in a Multiply and Add operation. The Multiply and Add operation is decribed later on in this document. The long term plan is to incorporate the use argument for other TPUs (AMX and DPAS). In the Tensor Cores architecture the supported matrix shapes and data types depend upon the `matrix_use`. +`matrix_use` is used to distinguish the suitable place for a given matrix in a Multiply and Add operation. The Multiply and Add operation is described later on in this document. The long term plan is to incorporate the use argument for other TPUs (AMX and DPAS). In the Tensor Cores architecture the supported matrix shapes and data types depend upon the `matrix_use`. The joint_matrix interface used by the Tensor Cores implementation is: ```c++ @@ -101,7 +101,7 @@ When the group is a `sycl::sub_group`, an example matrix declaration is as follo joint_matrix tA(sg); ``` -In the Nvidia case an example declaration adds the matrix_use parameter, e.g.: +In the Nvidia case an example declaration adds the `matrix_use` parameter, e.g.: ```c++ joint_matrix tA(sg); @@ -133,8 +133,8 @@ Intel AMX and DPAS hardware require B matrix to be in VNNI or 32 bits packed lay joint_matrix tB(sg); ``` -IMPORTANT: * AMX and DPAS: In the current AMX and DPAS implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. - * Tensor Cores: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. +IMPORTANT: AMX and DPAS: In the current AMX and DPAS implementation, only `packed_b` layout is necessary to specify on matrix B, the layout on other matrices is ignored. + Tensor Cores: For the CUDA backend the layout in the load of matrices A B and C must be either `row_major` or `col_major`, and `Layout` in the store of matrix C must also be either `row_major` or `col_major`. ## Matrix Operations and their Execution Scope @@ -150,9 +150,7 @@ Since the matrix functions are group operations (as defined in Section 4.17.3 of IMPORTANT: The CUDA backend does not require any other sub-group size than 32, which is the size of a warp which acts as the sub-group, and also the default DPC++ sub-group size for the CUDA backend. -To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. The {dpcpp} syntax is the following: - -IMPORTANT: In the current implementation, only the subgroup scope is supported. +To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. 
The {dpcpp} syntax is the following: #### Load @@ -170,7 +168,7 @@ This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS #### Tensor Cores syntax -In the Tensor Cores implementation the matrix_use enum is added: +In the Tensor Cores implementation the `matrix_use` enum is added: ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -210,7 +208,7 @@ void joint_matrix_store(Group sg, multi_ptr dst, size_t stride); } ``` -Note that this function is only available for matrix_use::accumulator. +Note that this function is only available for `matrix_use::accumulator`. #### Multiply and Add @@ -225,7 +223,7 @@ namespace sycl::ext::oneapi::experimental::matrix { } ``` -Multiply and Add (MAD) operations, also known as Matrix Multiply and Accumulate (MMA), multiply matrices A (matrix_use::a) with shape (M, K) and B (matrix_use::b) with shape (K, N) and add the result to matrix C (matrix_use::accumulator) with shape (M, N). The logical sizes are M, K, N. +Multiply and Add (MAD) operations, also known as Matrix Multiply and Accumulate (MMA), multiply matrices A (`matrix_use::a`) with shape (M, K) and B (`matrix_use::b`) with shape (K, N) and add the result to matrix C (`matrix_use::accumulator`) with shape (M, N). The logical sizes are M, K, N. C = A * B + C @@ -619,7 +617,7 @@ for (int i = 0; i < 8; i++) for (int j = 0; j < 8; j++) C(i,j) *= alpha; //Align with mdspan ``` -#### Option2: Restrictive fast element indexing +#### Option 2: Restrictive fast element indexing In the DPC++ context, the expectation is that all element-wise operations will happen in a converged control path by all work items in the group. Option 2 proposes a new set of element-wise operations by overloading existing operations to work on `matrix` object. An example is shown below: ```c++ @@ -628,7 +626,7 @@ joint_matrix C(sg); ``` The problem with this option is that it is restrictive to a very limited set of operations. -#### Option3: Restrictive conversion in the interface from SIMD to SPMD +#### Option 3: Restrictive conversion in the interface from SIMD to SPMD Nvidia® wmma interface added a new member to `fragment` class to designate the WI owned part of the matrix. While this provides fast element indexing on the GPU compared to the non-restrictive option, the user does not know the mapping of the owned data to the original matrix. However using the `mma` ptx instructions as opposed to the `wmma` ptx instructions the mapping is known. Knowing this mapping is important for the user to implement new operations like sum of rows of a matrix for quantized algorithms. @@ -665,7 +663,7 @@ We did not utilize this extension for this matrix API version because sub-group ## Open Questions - Besides row, col major and packed (VNNI) layout, what are the additional layouts that should absolutely be added? - Are there alternative names for the `packed_a` and `packed_b` layouts that would be clearer to distinguish between the VNNI Layout in matrix A and VNNI layout in matrix B of a matrix multiply and add operation on Intel AMX? --- Yes, this will be addressed in the next revision where `use` argument will be introduced to distinguish between right (B) , left (A), and accumulator matrix. +-- Yes, this will be addressed in the next revision where `matrix_use` argument will be introduced to distinguish between right (B) , left (A), and accumulator matrix. - Ronan Keryell: "It would be interesting to investigate whether providing also member functions would simplify the API. 
Provide both so it is possible to use the best one for each use case, while waiting for https://en.wikipedia.org/wiki/Uniform_Function_Call_Syntax to land into C++?" - In the future looking APIs, `get_wi_slice` (that is currently under design) returns an owned object. Should this return a view object to make sure the original matrix C is changed after its slices are modified. @@ -683,7 +681,7 @@ Tensor Cores: - Clarify USM compatibility - Add support for other combinations, the query interface, and consider how the future looking API can be added here - AMX and DPAS: +AMX and DPAS: - Add 'matrix_use' parameter to the matrix to distinguish between matrix A, B, and matrix accumulator. This is necessary for supporting VNNI and transpose transform ## Revision History From b3064e5db7e3c02c42e033ac53c23d77be44043e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 15 Nov 2021 20:40:03 +0000 Subject: [PATCH 10/16] Formatting Signed-off-by: jack.kirk --- sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 70ebea938e00..b59045f2e2cc 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -152,7 +152,7 @@ IMPORTANT: The CUDA backend does not require any other sub-group size than 32, w To be aligned with the SYCL 2020 group algorithms, an additional group argument is added to the matrix operations to designate that these functions are collective operations. The {dpcpp} syntax is the following: -#### Load +### Load ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -166,7 +166,7 @@ namespace sycl::ext::oneapi::experimental::matrix { This function loads data from memory to the 2d tiles/registers of Intel AMX/DPAS or Nvidia® Tensor Cores. -#### Tensor Cores syntax +##### Tensor Cores syntax In the Tensor Cores implementation the `matrix_use` enum is added: @@ -182,7 +182,7 @@ void joint_matrix_load( Note that `Layout` is not included as an argument since it may be determined from the joint_matrix argument. -#### Store +### Store ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -210,7 +210,7 @@ void joint_matrix_store(Group sg, ``` Note that this function is only available for `matrix_use::accumulator`. -#### Multiply and Add +### Multiply and Add ```c++ namespace sycl::ext::oneapi::experimental::matrix { From cc10a11e02e9a02bf6a66357aae9c1f29e46fc9a Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 16 Nov 2021 10:09:50 +0000 Subject: [PATCH 11/16] Small formatting improvements. Signed-off-by: jack.kirk --- sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index b59045f2e2cc..37bfe1ad53bd 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -33,7 +33,7 @@ SYCL specification refer to that revision. **_NOTE:_** _This document describes the current design and API for the matrix extension to {dpcpp}. This is an initial experimental version to try out functionality -and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. 
The current implementation provides support of the matrix interface on Intel(R) Advanced Matrix Extensions (AMX), DPAS, and Nvidia® Tensorcore. We are going to work with the community on incrementally improving +and performance, and **future versions of this API may change in ways that are incompatible with this experimental version**. The current implementation provides support of the matrix interface on Intel(R) Advanced Matrix Extensions (AMX), DPAS, and Nvidia® Tensor Cores. We are going to work with the community on incrementally improving the API to bring them closer to standard C++ (aligned with the `std::mdspan` and `std::mdarray` proposals) and SYCL in the next several months._ ## Introduction @@ -101,7 +101,7 @@ When the group is a `sycl::sub_group`, an example matrix declaration is as follo joint_matrix tA(sg); ``` -In the Nvidia case an example declaration adds the `matrix_use` parameter, e.g.: +In the Tensor Cores case an example declaration adds the `matrix_use` parameter, e.g.: ```c++ joint_matrix tA(sg); @@ -682,6 +682,7 @@ Tensor Cores: - Add support for other combinations, the query interface, and consider how the future looking API can be added here AMX and DPAS: + - Add 'matrix_use' parameter to the matrix to distinguish between matrix A, B, and matrix accumulator. This is necessary for supporting VNNI and transpose transform ## Revision History From 9814f97c0c602c886499dc00473cb2b4474b0d3e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 20 Jan 2022 10:53:10 +0000 Subject: [PATCH 12/16] Added description of single-bit Nvidia bmad. Signed-off-by: jack.kirk --- .../Matrix/dpcpp-joint-matrix.asciidoc | 65 ++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 37bfe1ad53bd..29e5f7a689cc 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -54,7 +54,7 @@ value to determine which of the extension's APIs the implementation supports. |Value |Description |1 |Initial extension implementation on Intel AMX. Base features are supported. |2 |Initial extension JIT implementation on Intel AMX and DPAS. load, store, mad and the query interface are supported -|3 |Initial extension implementation on Nvidia® Tensor Cores. Base features are supported. +|3 |Initial extension implementation on Nvidia® Tensor Cores. load, store, mad, and bmad are supported. bf16, fp19, mixed precision float, mixed precision u(int), double, and single-bit data formats are supported. |====================== ## New `joint_matrix` class @@ -242,6 +242,69 @@ joint_matrix_mad( } ``` +### Bitwise Multiply and Add - `joint_matrix_bmad` (Nvidia only) + +```c++ +namespace sycl::ext::oneapi::experimental::matrix { +template +joint_matrix +joint_matrix_bmad( + Group sg, joint_matrix A, + joint_matrix B, + joint_matrix C, BinaryOperation Op); +} +``` + +Bitwise Multiply and Add (BMAD) operations replace the usual dot product of a row of matrix A (M by K) with a column of matrix B (K by N). Instead a sequence of logical operations are performed: The AND or XOR logical operations operate on the ith bit of a K bit row of matrix A with the ith bit of a K bit column of matrix B to produce a 128 Bit intermediate output. +The Population Count (popc) operator then operates on this intermediate output and the result is added with the (M, N)th element of the accumulator matrix C. 
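Expressed as a scalar reference computation (an illustration of the semantics only, not the implementation), one accumulator element is updated as in the sketch below, assuming the 128 single-bit elements of a row of A and a column of B arrive packed into four 32-bit words:

```c++
#include <bit>      // std::popcount (C++20)
#include <cstdint>

// Reference sketch of one BMAD output element: popc(Op(row of A, column of B))
// added to C(m, n). a_row/b_col hold 128 single-bit elements in four words;
// use_xor selects the XOR variant, otherwise AND is applied.
int32_t bmad_element(const uint32_t a_row[4], const uint32_t b_col[4],
                     int32_t c, bool use_xor) {
  for (int w = 0; w < 4; ++w) {
    uint32_t bits = use_xor ? (a_row[w] ^ b_col[w]) : (a_row[w] & b_col[w]);
    c += std::popcount(bits);
  }
  return c;
}
```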
Currently only the shape M = 8, N = 8, K = 128 is supported. +The only change with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either: + +`sycl::bit_and()` + +or + +`sycl::bit_xor()` + +The A, B, and C `joint_matrix` objects are constructed and loaded/stored in the normal way, using the previously defined `joint_matrix`, `joint_matrix_load`, and `joint_matrix_store` interfaces. +The C matrix must be loaded from an array of 32 bit signed integers, and the A, B single bit matrices must be loaded from an array of unsigned 32-bit integers. + +IMPORTANT: when using Bitwise Multiply and Add matrix A must be in row major format and matrix B must be in column major format. + +#### Example using bitwise operations with `joint_matrix_bmad` + +```c++ +using namespace sycl::ext::oneapi::experimental::matrix; + +queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.template get_access(cgh); + auto accA = bufA.template get_access(cgh); + auto accB = bufB.template get_access(cgh); + auto accD = bufD.template get_access(cgh); + range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP}; + range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; + cgh.parallel_for>( + nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + const auto m = item.get_group().get_id()[0]; // row id of current submatrix of BIG C matrix + const auto n = item.get_group().get_id()[1]; // column id of current submatrix of BIG C matrix + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N); + for (int k = 0; k < Sub_Tiles_K; k++) // row/col id of current submatrix of BIG A/B matrices + { + joint_matrix_load(sg, sub_a, accA.get_pointer() + (k * K / 32) + (m * M * Big_K / 32), Big_K); + joint_matrix_load(sg, sub_b, accB.get_pointer() + (n * N * Big_K / 32) + (k * K / 32), Big_K); + sub_c = joint_matrix_bmad(sg, sub_a, sub_b, sub_c, Op); + } + joint_matrix_store(sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N); + }); + }).wait(); +``` + ## VNNI/Packed Layout Intel AMX and DPAS compute assumes register for B tile (src1) to be in VNNI format as they need 32bit of K-data in A and B to be contiguous in memory. The VNNI blocking factor is 2 in the case of 16-bit types, and it is 4 in the case of 8-bit types. While the current implementation assumes that the matrix has been already packed by the user for performance reasons, the layout information is needed to inform the implementation about this transform. The following example illustrates how a matrix in `row_major` layout is transformed into the `packed_b` layout for a 16-bit type. 
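The example itself falls outside the hunk shown above; as an illustration only, a host-side sketch of that 16-bit transform (blocking factor 2, so pairs of elements adjacent in K become adjacent in memory, i.e. B[k][n] maps to packed[k/2][2*n + k%2]) is:

```c++
#include <cstddef>
#include <cstdint>

// Sketch only: pack a row-major K x N matrix of 16-bit values into the
// packed_b (VNNI) layout. The packed matrix has K/2 rows of 2*N elements.
void pack_b_vnni_16bit(const uint16_t *B, uint16_t *packed,
                       std::size_t K, std::size_t N) {
  for (std::size_t k = 0; k < K; ++k)
    for (std::size_t n = 0; n < N; ++n)
      packed[(k / 2) * (2 * N) + 2 * n + (k % 2)] = B[k * N + n];
}
```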
From 9615ed2ed744cea987e9d952ca253b13958faf03 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Fri, 21 Jan 2022 11:46:17 +0000 Subject: [PATCH 13/16] Added a summary of BNNs, as motivation for BMADs Signed-off-by: jack.kirk --- .../extensions/Matrix/dpcpp-joint-matrix.asciidoc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 29e5f7a689cc..cbdfe54362f0 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -242,7 +242,7 @@ joint_matrix_mad( } ``` -### Bitwise Multiply and Add - `joint_matrix_bmad` (Nvidia only) +### Bitwise Multiply and Add - `joint_matrix_bmad` (Nvidia® only) ```c++ namespace sycl::ext::oneapi::experimental::matrix { @@ -270,7 +270,18 @@ or The A, B, and C `joint_matrix` objects are constructed and loaded/stored in the normal way, using the previously defined `joint_matrix`, `joint_matrix_load`, and `joint_matrix_store` interfaces. The C matrix must be loaded from an array of 32 bit signed integers, and the A, B single bit matrices must be loaded from an array of unsigned 32-bit integers. -IMPORTANT: when using Bitwise Multiply and Add matrix A must be in row major format and matrix B must be in column major format. +IMPORTANT: When using Bitwise Multiply and Add matrix A must be in row major format and matrix B must be in column major format. +IMPORTANT: Bitwise Multiply and Add operations are an experimental hardware feature and all implementation details are subject to change. + +#### Motivation for BMAD + +Single-bit MADs can be used as part of Binarized Neural Networks (BNNs) in the case that *both* the activations *and* weights are binarized. "Quantizing" a network to form a BNN represents the extreme limit of reducing the precision of the network degrees of freedom in order to gain performance, and reduce power and memory consumption. +Hubara et al. (I. Hubara, M. Courbariaux, D. Soudry, R. El-Yaniv, and Y. Bengio. Binarized Neural Networks, Advances in Neural Information Processing Systems 29 (NIPS 2016)) first demonstrated the utility of an algorithm that could use both binarized activations and weights with backpropogation, by keeping track of real valued weights which are mapped to the binarized weights. In the backwards pass the real valued weights are updated according to a heuristic named the "Straight Through Estimator", whereby the gradient of the loss function with respect to the real weights is set equal to the gradient of the loss function with respect to the binarized weights. +This implies that the precision of the data type used in the matrix multiplications can be single bit, with the necessary addition of forward and backward element wise mappings between binarized and real valued representations of the matrices. +This can prove a significant advantage for large models, since the computational cost of Matrix Multiplication scales with the number of elements per dimension, N, as O(N^3) for square matrices, whereas element wise operations scale as O(N^2). +Further algorithms based on this binarized approach have been proposed, e.g. see Rastegari et al. (M. Rastegari, V Ordonez, J. Redmon, and A. Farhadi. 
Computer Vision – ECCV 2016, 525-542) who have made a comparison between a binarized version of a CNN and corresponding full precision models, for both the accuracy and performance of image classification using the ImageNet data set. + +For a discussion of how bitwise MADs can be efficiently leveraged on current Nvidia® hardware see (A. Li, and S. Su. IEEE Transactions on Parallel and Distributed Systems, 32(7):1878-1891, 2021). #### Example using bitwise operations with `joint_matrix_bmad` From 1e963f7b535341f322d2d5517a05c5db08275a1d Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Fri, 21 Jan 2022 17:22:25 +0000 Subject: [PATCH 14/16] Simplified the template parameter interface for joint_matrix_bmad. Unnecessary template parameters are removed. Although only one "shape" is currently supported: m8n8k128, the shape template parameters are still used so that when future shapes are supported by the hardware they can be implemented. Signed-off-by: jack.kirk --- .../Matrix/dpcpp-joint-matrix.asciidoc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index cbdfe54362f0..a81daddce6f3 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -246,20 +246,19 @@ joint_matrix_mad( ```c++ namespace sycl::ext::oneapi::experimental::matrix { -template -joint_matrix +template +joint_matrix joint_matrix_bmad( - Group sg, joint_matrix A, - joint_matrix B, - joint_matrix C, BinaryOperation Op); + Group sg, joint_matrix A, + joint_matrix B, + joint_matrix C, BinaryOperation Op); } ``` Bitwise Multiply and Add (BMAD) operations replace the usual dot product of a row of matrix A (M by K) with a column of matrix B (K by N). Instead a sequence of logical operations are performed: The AND or XOR logical operations operate on the ith bit of a K bit row of matrix A with the ith bit of a K bit column of matrix B to produce a 128 Bit intermediate output. The Population Count (popc) operator then operates on this intermediate output and the result is added with the (M, N)th element of the accumulator matrix C. Currently only the shape M = 8, N = 8, K = 128 is supported. -The only change with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either: +The main change with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either: `sycl::bit_and()` @@ -270,7 +269,7 @@ or The A, B, and C `joint_matrix` objects are constructed and loaded/stored in the normal way, using the previously defined `joint_matrix`, `joint_matrix_load`, and `joint_matrix_store` interfaces. The C matrix must be loaded from an array of 32 bit signed integers, and the A, B single bit matrices must be loaded from an array of unsigned 32-bit integers. -IMPORTANT: When using Bitwise Multiply and Add matrix A must be in row major format and matrix B must be in column major format. +IMPORTANT: When using Bitwise Multiply and Add joint_matrix A must be in row major layout and joint_matrix B must be in column major layout. IMPORTANT: Bitwise Multiply and Add operations are an experimental hardware feature and all implementation details are subject to change. 
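For illustration only, a host-side helper that packs a matrix of 0/1 values into the 32-bit words consumed by `joint_matrix_load` might look like the sketch below. The helper and its least-significant-bit-first ordering are assumptions made for this sketch and are not specified by the extension; matrix A would be packed along its rows (row major) and matrix B along its columns (column major), as required above.

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch only: pack n_major * n_minor single-bit values (stored one per byte,
// contiguous along the minor dimension) into 32-bit words. n_minor is assumed
// to be a multiple of 32 (128 single-bit elements in the currently supported shape).
std::vector<uint32_t> pack_bits(const std::vector<uint8_t> &bits,
                                std::size_t n_major, std::size_t n_minor) {
  std::vector<uint32_t> packed(n_major * n_minor / 32, 0u);
  for (std::size_t i = 0; i < n_major * n_minor; ++i)
    if (bits[i])
      packed[i / 32] |= 1u << (i % 32); // bit order within a word is an assumption
  return packed;
}
```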
#### Motivation for BMAD From a42d74874d06b4e0ed71bfcc11fce9b7dbded72c Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 24 Jan 2022 11:57:30 +0000 Subject: [PATCH 15/16] Improved readability. --- .../extensions/Matrix/dpcpp-joint-matrix.asciidoc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index a81daddce6f3..1e3085ee8f0e 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -256,9 +256,9 @@ joint_matrix_bmad( } ``` -Bitwise Multiply and Add (BMAD) operations replace the usual dot product of a row of matrix A (M by K) with a column of matrix B (K by N). Instead a sequence of logical operations are performed: The AND or XOR logical operations operate on the ith bit of a K bit row of matrix A with the ith bit of a K bit column of matrix B to produce a 128 Bit intermediate output. +Bitwise Multiply and Add (BMAD) operations replace the usual dot product between a row of matrix A (M by K) with a column of matrix B (K by N). Instead, a sequence of logical operations are performed: The AND or XOR logical operations operate on the ith bit of a K bit row of matrix A with the ith bit of a K bit column of matrix B, to produce a 128 bit intermediate output. The Population Count (popc) operator then operates on this intermediate output and the result is added with the (M, N)th element of the accumulator matrix C. Currently only the shape M = 8, N = 8, K = 128 is supported. -The main change with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either: +The most important difference with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either: `sycl::bit_and()` @@ -266,18 +266,19 @@ or `sycl::bit_xor()` -The A, B, and C `joint_matrix` objects are constructed and loaded/stored in the normal way, using the previously defined `joint_matrix`, `joint_matrix_load`, and `joint_matrix_store` interfaces. +The A, B, and C `joint_matrix` objects are constructed and loaded/stored in the normal way, using the previously defined `joint_matrix`, `joint_matrix_load`, and `joint_matrix_store` interfaces respectively. The C matrix must be loaded from an array of 32 bit signed integers, and the A, B single bit matrices must be loaded from an array of unsigned 32-bit integers. IMPORTANT: When using Bitwise Multiply and Add joint_matrix A must be in row major layout and joint_matrix B must be in column major layout. + IMPORTANT: Bitwise Multiply and Add operations are an experimental hardware feature and all implementation details are subject to change. #### Motivation for BMAD -Single-bit MADs can be used as part of Binarized Neural Networks (BNNs) in the case that *both* the activations *and* weights are binarized. "Quantizing" a network to form a BNN represents the extreme limit of reducing the precision of the network degrees of freedom in order to gain performance, and reduce power and memory consumption. -Hubara et al. (I. Hubara, M. Courbariaux, D. Soudry, R. El-Yaniv, and Y. Bengio. Binarized Neural Networks, Advances in Neural Information Processing Systems 29 (NIPS 2016)) first demonstrated the utility of an algorithm that could use both binarized activations and weights with backpropogation, by keeping track of real valued weights which are mapped to the binarized weights. 
In the backwards pass the real valued weights are updated according to a heuristic named the "Straight Through Estimator", whereby the gradient of the loss function with respect to the real weights is set equal to the gradient of the loss function with respect to the binarized weights. +Single-bit MADs can be used as part of Binarized Neural Networks (BNNs) in the case that *both* the activations *and* weights are binarized. "Quantizing" a network to form a BNN represents the extreme limit of reducing the precision of the network degrees of freedom in order to gain performance and improve efficiency. +Hubara et al. (I. Hubara, M. Courbariaux, D. Soudry, R. El-Yaniv, and Y. Bengio. Binarized Neural Networks, Advances in Neural Information Processing Systems 29 (NIPS 2016)) first demonstrated the utility of an algorithm that could use both binarized activations and weights with backpropagation, by keeping track of real valued weights which are mapped to the binarized weights. In the backwards pass the real valued weights are updated according to a heuristic named the "Straight Through Estimator", whereby the gradient of the loss function with respect to the real weights is set equal to the gradient of the loss function with respect to the binarized weights. This implies that the precision of the data type used in the matrix multiplications can be single bit, with the necessary addition of forward and backward element wise mappings between binarized and real valued representations of the matrices. -This can prove a significant advantage for large models, since the computational cost of Matrix Multiplication scales with the number of elements per dimension, N, as O(N^3) for square matrices, whereas element wise operations scale as O(N^2). +This could prove a significant advantage for large models, since the computational cost of Matrix Multiplication scales with the number of elements per dimension, N, as O(N^3) for square matrices, whereas corresponding element wise operations scale as O(N^2). Further algorithms based on this binarized approach have been proposed, e.g. see Rastegari et al. (M. Rastegari, V Ordonez, J. Redmon, and A. Farhadi. Computer Vision – ECCV 2016, 525-542) who have made a comparison between a binarized version of a CNN and corresponding full precision models, for both the accuracy and performance of image classification using the ImageNet data set. For a discussion of how bitwise MADs can be efficiently leveraged on current Nvidia® hardware see (A. Li, and S. Su. IEEE Transactions on Parallel and Distributed Systems, 32(7):1878-1891, 2021). From ecb8e6bc1f75d7a3599c8b894d9b02a163b2842a Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Fri, 18 Feb 2022 11:32:53 +0000 Subject: [PATCH 16/16] Updated BMAD description. --- .../Matrix/dpcpp-joint-matrix.asciidoc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc index 1e3085ee8f0e..5ab2ba094601 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc @@ -256,9 +256,9 @@ joint_matrix_bmad( } ``` -Bitwise Multiply and Add (BMAD) operations replace the usual dot product between a row of matrix A (M by K) with a column of matrix B (K by N). 
-Bitwise Multiply and Add (BMAD) operations replace the usual dot product between a row of matrix A (M by K) with a column of matrix B (K by N). Instead, a sequence of logical operations are performed: The AND or XOR logical operations operate on the ith bit of a K bit row of matrix A with the ith bit of a K bit column of matrix B, to produce a 128 bit intermediate output.
-The Population Count (popc) operator then operates on this intermediate output and the result is added with the (M, N)th element of the accumulator matrix C. Currently only the shape M = 8, N = 8, K = 128 is supported.
-The most important difference with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either:
+Bitwise Multiply and Add (BMAD) operations replace the usual dot product between a row of matrix A (M by K) and a column of matrix B (K by N), where the programmer can, for example, construct a standard C++ (M by K) array of a specified type T to represent matrix A. Instead, a sequence of logical operations is performed: the AND or XOR logical operations operate on the ith bit of a (K * 32) bit row of matrix A with the ith bit of a (K * 32) bit column of matrix B, to produce a 128 bit intermediate output.
+The Population Count (popc) operator then operates on this intermediate output and the result is added to the (M, N)th element of the accumulator matrix C. Currently only the shape M = 8, N = 8, K = 4 (K = 4 corresponds to 128 single-bit matrix elements) is supported.
+An important difference with respect to the joint_matrix_mad interface is the addition of the `BinaryOperator Op` parameter. `Op` may be either:
 
 `sycl::bit_and()`
 
@@ -279,9 +279,9 @@ Single-bit MADs can be used as part of Binarized Neural Networks (BNNs) in the c
 Hubara et al. (I. Hubara, M. Courbariaux, D. Soudry, R. El-Yaniv, and Y. Bengio. Binarized Neural Networks, Advances in Neural Information Processing Systems 29 (NIPS 2016)) first demonstrated the utility of an algorithm that could use both binarized activations and weights with backpropagation, by keeping track of real valued weights which are mapped to the binarized weights. In the backwards pass the real valued weights are updated according to a heuristic named the "Straight Through Estimator", whereby the gradient of the loss function with respect to the real weights is set equal to the gradient of the loss function with respect to the binarized weights.
 This implies that the precision of the data type used in the matrix multiplications can be single bit, with the necessary addition of forward and backward element wise mappings between binarized and real valued representations of the matrices.
 This could prove a significant advantage for large models, since the computational cost of Matrix Multiplication scales with the number of elements per dimension, N, as O(N^3) for square matrices, whereas corresponding element wise operations scale as O(N^2).
-Further algorithms based on this binarized approach have been proposed, e.g. see Rastegari et al. (M. Rastegari, V Ordonez, J. Redmon, and A. Farhadi. Computer Vision – ECCV 2016, 525-542) who have made a comparison between a binarized version of a CNN and corresponding full precision models, for both the accuracy and performance of image classification using the ImageNet data set.
+Further algorithms based on this binarized approach have been proposed, e.g. see Rastegari et al. (M. Rastegari, V Ordonez, J. Redmon, and A. Farhadi. Computer Vision – ECCV 2016, 525-542) who have made a comparison between a binarized version of a CNN (using an XNOR binary dot product) and corresponding full precision models, for both the accuracy and performance of image classification using the ImageNet data set.
 
-For a discussion of how bitwise MADs can be efficiently leveraged on current Nvidia® hardware see (A. Li, and S. Su. IEEE Transactions on Parallel and Distributed Systems, 32(7):1878-1891, 2021).
+For an example of how bitwise MADs can be leveraged on current Nvidia® hardware, see (A. Li and S. Su. IEEE Transactions on Parallel and Distributed Systems, 32(7):1878-1891, 2021).
 
 #### Example using bitwise operations with `joint_matrix_bmad`
 
@@ -297,18 +297,18 @@ queue q;
 range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
 range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};
 cgh.parallel_for>(
-    nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
+    nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) {
       sycl::sub_group sg = item.get_sub_group();
       const auto m = item.get_group().get_id()[0]; // row id of current submatrix of BIG C matrix
       const auto n = item.get_group().get_id()[1]; // column id of current submatrix of BIG C matrix
-      joint_matrix sub_a;
-      joint_matrix sub_b;
+      joint_matrix sub_a;
+      joint_matrix sub_b;
       joint_matrix sub_c;
       joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N);
       for (int k = 0; k < Sub_Tiles_K; k++) // row/col id of current submatrix of BIG A/B matrices
       {
-        joint_matrix_load(sg, sub_a, accA.get_pointer() + (k * K / 32) + (m * M * Big_K / 32), Big_K);
-        joint_matrix_load(sg, sub_b, accB.get_pointer() + (n * N * Big_K / 32) + (k * K / 32), Big_K);
+        joint_matrix_load(sg, sub_a, accA.get_pointer() + (k * K) + (m * M * Big_K), Big_K);
+        joint_matrix_load(sg, sub_b, accB.get_pointer() + (n * N * Big_K) + (k * K), Big_K);
        sub_c = joint_matrix_bmad(sg, sub_a, sub_b, sub_c, Op);
      }
      joint_matrix_store(sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N);
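To make the accumulation described in the patches above concrete, the following standalone host-side sketch models what a single BMAD element update computes: the popcount of the AND or XOR of a 128-bit row of A and a 128-bit column of B (stored as K = 4 words of 32 bits), added to a 32-bit signed accumulator element. This is an illustrative scalar model only, not part of the extension or of the patches; the names `bmad_element`, `RowBits`, and `BmadOp` are hypothetical and do not belong to the SYCL API.

```c++
#include <array>
#include <bitset>
#include <cstdint>
#include <iostream>

constexpr int K = 4; // K 32-bit words per row/column, i.e. 128 single-bit elements

// One 128-bit row of A (or column of B), packed into unsigned 32-bit words.
using RowBits = std::array<uint32_t, K>;

enum class BmadOp { bit_and, bit_xor };

// Scalar model of one accumulator update: returns c + popc(op(a_row, b_col)).
int32_t bmad_element(const RowBits &a_row, const RowBits &b_col, int32_t c, BmadOp op) {
  int32_t popc = 0;
  for (int w = 0; w < K; ++w) {
    uint32_t bits = (op == BmadOp::bit_and) ? (a_row[w] & b_col[w]) : (a_row[w] ^ b_col[w]);
    popc += static_cast<int32_t>(std::bitset<32>(bits).count()); // count set bits in this word
  }
  return c + popc;
}

int main() {
  RowBits a_row{0xFFFFFFFFu, 0x00000000u, 0xF0F0F0F0u, 0x00000001u};
  RowBits b_col{0xFFFFFFFFu, 0xFFFFFFFFu, 0x0F0F0F0Fu, 0x00000001u};
  // AND counts matching set bits (32 + 0 + 0 + 1 = 33); XOR counts differing bits (0 + 32 + 32 + 0 = 64).
  std::cout << bmad_element(a_row, b_col, 0, BmadOp::bit_and) << "\n"; // prints 33
  std::cout << bmad_element(a_row, b_col, 0, BmadOp::bit_xor) << "\n"; // prints 64
  return 0;
}
```

In the `joint_matrix_bmad` example above, each work item would effectively contribute such per-element updates for the 8 x 8 accumulator tile, with the choice between the two reductions made by passing `sycl::bit_and()` or `sycl::bit_xor()` as `Op`.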