From 64968c1e366b0b0b25e616e150c47cfd1f25200f Mon Sep 17 00:00:00 2001 From: Kai Yu Chen Date: Wed, 16 Sep 2020 10:42:34 -0700 Subject: [PATCH 1/4] [SYCL][ESIMD] Update ESIMD tests and add raw send support. --- .../esimd/detail/esimd_memory_intrin.hpp | 304 ++++++++++++++++++ .../CL/sycl/INTEL/esimd/esimd_math.hpp | 6 +- .../CL/sycl/INTEL/esimd/esimd_memory.hpp | 210 ++++++++++++ sycl/test/esimd/on-device/Stencil.cpp | 188 +++++++++++ .../esimd/on-device/histogram_raw_send.cpp | 249 ++++++++++++++ sycl/test/esimd/on-device/reduction.cpp | 142 ++++++++ sycl/test/esimd/on-device/stencil2.cpp | 190 +++++++++++ sycl/test/esimd/on-device/vadd_raw_send.cpp | 152 +++++++++ 8 files changed, 1438 insertions(+), 3 deletions(-) create mode 100644 sycl/test/esimd/on-device/Stencil.cpp create mode 100644 sycl/test/esimd/on-device/histogram_raw_send.cpp create mode 100644 sycl/test/esimd/on-device/reduction.cpp create mode 100644 sycl/test/esimd/on-device/stencil2.cpp create mode 100644 sycl/test/esimd/on-device/vadd_raw_send.cpp diff --git a/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp index 0c53470712ee6..c43504d8816cd 100644 --- a/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp @@ -227,6 +227,151 @@ __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y, sycl::INTEL::gpu::vector_type_t vals); +/// \brief esimd_get_value +/// +/// @param sid the SYCL accessor. +/// +/// Returns the binding table index value. +/// +template +SYCL_EXTERNAL uint32_t __esimd_get_value(SurfIndAliasTy sid); + +/// \brief Raw sends load. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numSrc1 the number of GRFs for source-1, which must be a compile time +/// constant. +/// +/// @param numDst the number of GRFs for destination, which must be a compile +/// time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgSrc1 the second source operand of send message. +/// +/// @param msgDst the destination operand of send message. +/// +/// Returns a simd vector of type Ty1 and size N1. +/// +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, + uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0, + sycl::INTEL::gpu::vector_type_t msgSrc1, + sycl::INTEL::gpu::vector_type_t msgDst); + +/// \brief Raw send load. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numDst the number of GRFs for destination, which must be a compile +/// time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgDst the destination operand of send message. +/// +/// Returns a simd vector of type Ty1 and size N1. +/// +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_raw_send_load(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t numDst, uint8_t sfid, + uint32_t exDesc, uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0, + sycl::INTEL::gpu::vector_type_t msgDst); + +/// \brief Raw sends store. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numSrc1 the number of GRFs for source-1, which must be a compile time +/// constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgSrc1 the second source operand of send message. +/// +template +SYCL_EXTERNAL void +__esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, + uint32_t exDesc, uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0, + sycl::INTEL::gpu::vector_type_t msgSrc1); + +/// \brief Raw send store. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +template +SYCL_EXTERNAL void +__esimd_raw_send_store(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0); #ifndef __SYCL_DEVICE_ONLY__ template +SYCL_EXTERNAL uint32_t __esimd_get_value(AccessorTy acc) { + throw cl::sycl::feature_not_supported(); + return 0; +} + +/// \brief Raw sends load. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numSrc1 the number of GRFs for source-1, which must be a compile time +/// constant. +/// +/// @param numDst the number of GRFs for destination, which must be a compile +/// time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgSrc1 the second source operand of send message. +/// +/// @param msgDst the destination operand of send message. +/// +/// Returns a simd vector of type Ty1 and size N1. +/// +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, + uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0, + sycl::INTEL::gpu::vector_type_t msgSrc1, + sycl::INTEL::gpu::vector_type_t msgDst) { + throw cl::sycl::feature_not_supported(); + return 0; +} + +/// \brief Raw send load. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numDst the number of GRFs for destination, which must be a compile +/// time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgDst the destination operand of send message. +/// +/// Returns a simd vector of type Ty1 and size N1. +/// +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_raw_send_load(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t numDst, uint8_t sfid, + uint32_t exDesc, uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0, + sycl::INTEL::gpu::vector_type_t msgDst) { + throw cl::sycl::feature_not_supported(); + return 0; +} + +/// \brief Raw sends store. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numSrc1 the number of GRFs for source-1, which must be a compile time +/// constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgSrc1 the second source operand of send message. +/// +template +SYCL_EXTERNAL void +__esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, + uint32_t exDesc, uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0, + sycl::INTEL::gpu::vector_type_t msgSrc1) { + throw cl::sycl::feature_not_supported(); +} + +/// \brief Raw send store. +/// +/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param pred the predicate to specify enabled channels. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param msgSrc0 the first source operand of send message. +/// +template +SYCL_EXTERNAL void +__esimd_raw_send_store(uint8_t modifier, uint8_t execSize, + sycl::INTEL::gpu::vector_type_t pred, + uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, + sycl::INTEL::gpu::vector_type_t msgSrc0) { + throw cl::sycl::feature_not_supported(); +} + #endif // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp index 1f241c63745cb..098c387cb21b8 100644 --- a/sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp @@ -1838,7 +1838,7 @@ template struct esimd_apply_reduced_min { template class OpType> -T1 esimd_reduce_single(simd v) { +T0 esimd_reduce_single(simd v) { if constexpr (SZ == 1) { return v[0]; } else { @@ -1854,7 +1854,7 @@ T1 esimd_reduce_single(simd v) { template class OpType> -T1 esimd_reduce_pair(simd v1, simd v2) { +T0 esimd_reduce_pair(simd v1, simd v2) { if constexpr (N1 == N2) { simd tmp = OpType()(v1, v2); return esimd_reduce_single(tmp); @@ -1879,7 +1879,7 @@ T1 esimd_reduce_pair(simd v1, simd v2) { template class OpType> -T1 esimd_reduce(simd v) { +T0 esimd_reduce(simd v) { constexpr bool isPowerOf2 = __esimd::isPowerOf2(SZ); if constexpr (isPowerOf2) { return esimd_reduce_single(v); diff --git a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp index 8925f858b1fea..5f5686f7bd992 100644 --- a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp @@ -643,6 +643,216 @@ media_block_store(AccessorTy acc, unsigned x, unsigned y, simd vals) { SYCL_EXTERNAL void slm_init(uint32_t size) {} #endif + +/// \brief esimd_get_value +/// +/// @param acc the SYCL accessor. +/// +/// Returns the binding table index value. +/// +template +ESIMD_INLINE ESIMD_NODEBUG uint32_t esimd_get_value(AccessorTy acc) { +#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__) + return __esimd_get_value(AccessorPrivateProxy::getNativeImageObj(acc)); +#else + return __esimd_get_value(acc); +#endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__ +} + +/// \brief Raw sends load. +/// +/// @param msgDst the old value of the destination operand. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgSrc1 the second source operand of send message. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numSrc1 the number of GRFs for source-1, which must be a compile time +/// constant. +/// +/// @param numDst the number of GRFs for destination, which must be a compile +/// time constant. +/// +/// @param isEOT the flag that indicates whether this is an EOT message, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param isSendc the flag that indicates whether sendc should be used, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param mask the predicate to specify enabled channels (optional - default to +/// on). +/// +/// Returns a simd vector of type T1 and size n1. +/// +/// Raw send APIs are used to implement the send messages on Intel� processor +/// graphics. +/// https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-icllp-vol02a-commandreference-instructions_2.pdf +/// +template +ESIMD_INLINE ESIMD_NODEBUG simd +esimd_raw_sends_load(simd msgDst, simd msgSrc0, + simd msgSrc1, uint32_t exDesc, uint32_t msgDesc, + uint8_t execSize, uint8_t sfid, uint8_t numSrc0, + uint8_t numSrc1, uint8_t numDst, uint8_t isEOT = 0, + uint8_t isSendc = 0, simd mask = 1) { + constexpr unsigned _Width1 = n1 * sizeof(T1); + static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar"); + constexpr unsigned _Width2 = n2 * sizeof(T2); + static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0"); + constexpr unsigned _Width3 = n3 * sizeof(T3); + static_assert(_Width3 % 32 == 0, "Invalid size for raw send msgSrc1"); + + uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); + return __esimd_raw_sends_load( + modifier, execSize, mask.data(), numSrc0, numSrc1, numDst, sfid, exDesc, + msgDesc, msgSrc0.data(), msgSrc1.data(), msgDst.data()); +} + +/// \brief Raw send load. +/// +/// @param msgDst the old value of the destination operand. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numDst the number of GRFs for destination, which must be a compile +/// time constant. +/// +/// @param isEOT the flag that indicates whether this is an EOT message, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param isSendc the flag that indicates whether sendc should be used, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param mask the predicate to specify enabled channels (optional - default to +/// on). +/// +/// Returns a simd vector of type T1 and size n1. +/// +template +ESIMD_INLINE ESIMD_NODEBUG simd +esimd_raw_send_load(simd msgDst, simd msgSrc0, uint32_t exDesc, + uint32_t msgDesc, uint8_t execSize, uint8_t sfid, + uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, + uint8_t isSendc = 0, simd mask = 1) { + constexpr unsigned _Width1 = n1 * sizeof(T1); + static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar"); + constexpr unsigned _Width2 = n2 * sizeof(T2); + static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0"); + + uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); + return __esimd_raw_send_load( + modifier, execSize, mask.data(), numSrc0, numDst, sfid, exDesc, msgDesc, + msgSrc0.data(), msgDst.data()); +} + +/// \brief Raw sends store. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param msgSrc1 the second source operand of send message. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param numSrc1 the number of GRFs for source-1, which must be a compile time +/// constant. +/// +/// @param isEOT the flag that indicates whether this is an EOT message, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param isSendc the flag that indicates whether sendc should be used, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param mask the predicate to specify enabled channels (optional - default to +/// on). +/// +template +ESIMD_INLINE ESIMD_NODEBUG void +esimd_raw_sends_store(simd msgSrc0, simd msgSrc1, + uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, + uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, + uint8_t isEOT = 0, uint8_t isSendc = 0, + simd mask = 1) { + constexpr unsigned _Width1 = n1 * sizeof(T1); + static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0"); + constexpr unsigned _Width2 = n2 * sizeof(T2); + static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc1"); + + uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); + __esimd_raw_sends_store( + modifier, execSize, mask.data(), numSrc0, numSrc1, sfid, exDesc, msgDesc, + msgSrc0.data(), msgSrc1.data()); +} + +/// \brief Raw send store. +/// +/// @param msgSrc0 the first source operand of send message. +/// +/// @param exDesc the extended message descriptor. +/// +/// @param msgDesc the message descriptor. +/// +/// @param execSize the execution size, which must be a compile time constant. +/// +/// @param sfid the shared function ID, which must be a compile time constant. +/// +/// @param numSrc0 the number of GRFs for source-0, which must be a compile time +/// constant. +/// +/// @param isEOT the flag that indicates whether this is an EOT message, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param isSendc the flag that indicates whether sendc should be used, which +/// must be a compile time constant (optional - default to 0). +/// +/// @param mask the predicate to specify enabled channels (optional - default to +/// on). +/// +template +ESIMD_INLINE ESIMD_NODEBUG void +esimd_raw_send_store(simd msgSrc0, uint32_t exDesc, uint32_t msgDesc, + uint8_t execSize, uint8_t sfid, uint8_t numSrc0, + uint8_t isEOT = 0, uint8_t isSendc = 0, + simd mask = 1) { + constexpr unsigned _Width1 = n1 * sizeof(T1); + static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0"); + + uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); + __esimd_raw_send_store(modifier, execSize, mask.data(), numSrc0, + sfid, exDesc, msgDesc, msgSrc0.data()); +} + } // namespace gpu } // namespace INTEL } // namespace sycl diff --git a/sycl/test/esimd/on-device/Stencil.cpp b/sycl/test/esimd/on-device/Stencil.cpp new file mode 100644 index 0000000000000..75d71d304811a --- /dev/null +++ b/sycl/test/esimd/on-device/Stencil.cpp @@ -0,0 +1,188 @@ +//==---------------- stencil.cpp - DPC++ ESIMD on-device test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// TODO enable on Windows +// REQUIRES: linux +// REQUIRES: gpu +// RUN: %clangxx-esimd -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +// +// test smaller input size +// test 8x16 block size +// +#define DIM_SIZE (1 << 10) +#define SQUARE_SZ (DIM_SIZE * DIM_SIZE) + +#define WIDTH 16 +#define HEIGHT 16 + +using namespace cl::sycl; + +void InitializeSquareMatrix(float *matrix, size_t const Dim, + bool const bSkipDataGeneration) { + memset(matrix, 0, Dim * Dim * sizeof(float)); + if (!bSkipDataGeneration) { + for (unsigned int iRow = 0; iRow < Dim; ++iRow) { + for (unsigned int iCol = 0; iCol < Dim; ++iCol) { + matrix[iRow * Dim + iCol] = static_cast(iRow + iCol); + } + } + } +} + +bool CheckResults(float *out, float *in) { + unsigned int n = DIM_SIZE; + for (unsigned int i = 0; i < n; i++) { + for (unsigned int j = 0; j < n; j++) { + if ((5 <= i) && (i < n - 5) && (5 <= j) && (j < n - 5)) { + float res = +in[(i - 5) * n + (j + 0)] * -0.02f + + in[(i - 4) * n + (j + 0)] * -0.025f + + in[(i - 3) * n + (j + 0)] * -0.0333333333333f + + in[(i - 2) * n + (j + 0)] * -0.05f + + in[(i - 1) * n + (j + 0)] * -0.1f + + in[(i + 0) * n + (j - 5)] * -0.02f + + in[(i + 0) * n + (j - 4)] * -0.025f + + in[(i + 0) * n + (j - 3)] * -0.0333333333333f + + in[(i + 0) * n + (j - 2)] * -0.05f + + in[(i + 0) * n + (j - 1)] * -0.1f + + in[(i + 0) * n + (j + 1)] * 0.1f + + in[(i + 0) * n + (j + 2)] * 0.05f + + in[(i + 0) * n + (j + 3)] * 0.0333333333333f + + in[(i + 0) * n + (j + 4)] * 0.025f + + in[(i + 0) * n + (j + 5)] * 0.02f + + in[(i + 1) * n + (j + 0)] * 0.1f + + in[(i + 2) * n + (j + 0)] * 0.05f + + in[(i + 3) * n + (j + 0)] * 0.0333333333333f + + in[(i + 4) * n + (j + 0)] * 0.025f + + in[(i + 5) * n + (j + 0)] * 0.02f; + + // check result + if (abs(res - out[i * n + j]) >= 0.0015f) { + std::cout << "out[" << i << "][" << j << "] = " << out[i * n + j] + << " expect result " << res << std::endl; + return false; + } + } + } + } + return true; +} + +int main(void) { + uint range_width = + (DIM_SIZE - 10) / WIDTH + (((DIM_SIZE - 10) % WIDTH == 0) ? 0 : 1); + uint range_height = + (DIM_SIZE - 10) / HEIGHT + (((DIM_SIZE - 10) % HEIGHT == 0) ? 0 : 1); + cl::sycl::range<2> GlobalRange{range_width, range_height}; + + std::cout << "width = " << range_width << " height = " << range_height + << std::endl; + cl::sycl::range<2> LocalRange{1, 1}; + + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + auto ctxt = q.get_context(); + + // create and init matrices + float *inputMatrix = + static_cast(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt)); + float *outputMatrix = + static_cast(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt)); + InitializeSquareMatrix(inputMatrix, DIM_SIZE, false); + InitializeSquareMatrix(outputMatrix, DIM_SIZE, true); + + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + uint h_pos = it.get_id(0); + uint v_pos = it.get_id(1); + + simd vin; + // matrix HEIGHT+10 x 32 + auto in = vin.format(); + + // + // rather than loading all data in + // the code will interleave data loading and compute + // first, we load enough data for the first 16 pixels + // + unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; +#pragma unroll + for (unsigned i = 0; i < 10; i++) { + in.row(i) = block_load(inputMatrix + off); + off += DIM_SIZE; + } + + unsigned out_off = + (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * + sizeof(float); + simd elm16(0, 1); + +#pragma unroll + for (unsigned i = 0; i < HEIGHT; i++) { + + in.row(10 + i) = block_load(inputMatrix + off); + off += DIM_SIZE; + + simd sum = + in.row(i + 0).select(5) * -0.02f + + in.row(i + 1).select(5) * -0.025f + + in.row(i + 2).select(5) * -0.0333333333333f + + in.row(i + 3).select(5) * -0.05f + + in.row(i + 4).select(5) * -0.1f + + in.row(i + 6).select(5) * 0.1f + + in.row(i + 7).select(5) * 0.05f + + in.row(i + 8).select(5) * 0.0333333333333f + + in.row(i + 9).select(5) * 0.025f + + in.row(i + 10).select(5) * 0.02f + + in.row(i + 5).select(0) * -0.02f + + in.row(i + 5).select(1) * -0.025f + + in.row(i + 5).select(2) * -0.0333333333333f + + in.row(i + 5).select(3) * -0.05f + + in.row(i + 5).select(4) * -0.1f + + in.row(i + 5).select(6) * 0.1f + + in.row(i + 5).select(7) * 0.05f + + in.row(i + 5).select(8) * 0.0333333333333f + + in.row(i + 5).select(9) * 0.025f + + in.row(i + 5).select(10) * 0.02f; + + // predciate output + simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; + + simd elm16_off = elm16 * sizeof(float) + out_off; + scatter(outputMatrix, sum, elm16_off, p); + out_off += DIM_SIZE * sizeof(float); + + if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) + break; + } + }); + }); + e.wait(); + + // check result + bool passed = CheckResults(outputMatrix, inputMatrix); + if (passed) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + } + free(inputMatrix, ctxt); + free(outputMatrix, ctxt); + return 0; +} diff --git a/sycl/test/esimd/on-device/histogram_raw_send.cpp b/sycl/test/esimd/on-device/histogram_raw_send.cpp new file mode 100644 index 0000000000000..b094ef426ec81 --- /dev/null +++ b/sycl/test/esimd/on-device/histogram_raw_send.cpp @@ -0,0 +1,249 @@ +//==------------ histogram_raw_send.cpp - DPC++ ESIMD on-device test +//-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// TODO enable on Windows +// REQUIRES: linux +// REQUIRES: gpu +// RUN: %clangxx-esimd -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include +#include + +using namespace cl::sycl; + +#define NUM_BINS 256 +#define IMG_WIDTH 1024 +#define IMG_HEIGHT 1024 +// +// each parallel_for handles 64x32 bytes +// +#define BLOCK_WIDTH 32 +#define BLOCK_HEIGHT 64 + +void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY, + unsigned int *cpuHistogram) { + int i; + for (i = 0; i < width * height; i++) { + cpuHistogram[srcY[i]] += 1; + } +} + +void writeHist(unsigned int *hist) { + int total = 0; + + std::cerr << "\nHistogram: \n"; + for (int i = 0; i < NUM_BINS; i += 8) { + std::cerr << "\n [" << i << " - " << i + 7 << "]:"; + for (int j = 0; j < 8; j++) { + std::cerr << "\t" << hist[i + j]; + total += hist[i + j]; + } + } + std::cerr << "\nTotal = " << total << " \n"; +} + +int checkHistogram(unsigned int *refHistogram, unsigned int *hist) { + + for (int i = 0; i < NUM_BINS; i++) { + if (refHistogram[i] != hist[i]) { + return 0; + } + } + return 1; +} + +using namespace sycl::INTEL::gpu; +template +ESIMD_INLINE void atomic_write(T *bins, simd offset, + simd src0, simd pred) { + simd oldDst; + simd vAddr(reinterpret_cast(bins)); + simd vOffset = convert(offset); + vAddr += vOffset; + + uint32_t exDesc = 0x4C; + uint32_t desc = 0x414A7FF; + constexpr uint8_t execSize = 0x83; + constexpr uint8_t sfid = 0x1; + constexpr uint8_t numDst = 0x1; + constexpr uint8_t numSrc0 = 0x2; + constexpr uint8_t numSrc1 = 0x1; + constexpr uint8_t isEOT = 0; + constexpr uint8_t isSendc = 0; + + esimd_raw_sends_load(oldDst, vAddr, src0, exDesc, desc, execSize, sfid, + numSrc0, numSrc1, numDst, isEOT, isSendc, pred); +} + +int main(int argc, char *argv[]) { + + const char *input_file = nullptr; + unsigned int width = IMG_WIDTH * sizeof(unsigned int); + unsigned int height = IMG_HEIGHT; + + if (argc == 2) { + input_file = argv[1]; + } else { + std::cerr << "Usage: Histogram.exe input_file" << std::endl; + std::cerr << "No input file specificed. Use default random value ...." + << std::endl; + } + + // ------------------------------------------------------------------------ + // Read in image luma plane + + // Allocate Input Buffer + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + auto ctxt = q.get_context(); + unsigned char *srcY = + static_cast(malloc_shared(width * height, dev, ctxt)); + unsigned int *bins = static_cast( + malloc_shared(NUM_BINS * sizeof(unsigned int), dev, ctxt)); + std::cout << "Running on " << dev.get_info() << "\n"; + + uint range_width = width / BLOCK_WIDTH; + uint range_height = height / BLOCK_HEIGHT; + + if (srcY == NULL) { + std::cerr << "Out of memory\n"; + exit(1); + } + + // Initializes input. + unsigned int input_size = width * height; + std::cerr << "Processing inputs\n"; + + if (input_file != nullptr) { + FILE *f = fopen(input_file, "rb"); + if (f == NULL) { + std::cerr << "Error opening file " << input_file; + std::exit(1); + } + + unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f); + if (cnt != input_size) { + std::cerr << "Error reading input from " << input_file; + std::exit(1); + } + } else { + srand(2009); + for (int i = 0; i < input_size; ++i) { + srcY[i] = rand() % 256; + } + } + + for (int i = 0; i < NUM_BINS; i++) { + bins[i] = 0; + } + + // ------------------------------------------------------------------------ + // CPU Execution: + + unsigned int cpuHistogram[NUM_BINS]; + memset(cpuHistogram, 0, sizeof(cpuHistogram)); + histogram_CPU(width, height, srcY, cpuHistogram); + + cl::sycl::image<2> Img(srcY, image_channel_order::rgba, + image_channel_type::unsigned_int32, + range<2>{width / sizeof(uint4), height}); + + { + // create ranges + // We need that many task groups + auto GlobalRange = range<1>(range_width * range_height); + // We need that many tasks in each group + auto LocalRange = range<1>(1); + nd_range<1> Range(GlobalRange, LocalRange); + + auto e = q.submit([&](handler &cgh) { + auto readAcc = Img.get_access(cgh); + + cgh.parallel_for( + Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { + // Get thread origin offsets + uint tid = ndi.get_group(0); + uint h_pos = (tid % range_width) * BLOCK_WIDTH; + uint v_pos = (tid / range_width) * BLOCK_HEIGHT; + + // Declare a 8x32 uchar matrix to store the input block pixel value + simd in; + + // Declare a vector to store the local histogram + simd histogram(0); + + // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block + for (int y = 0; y < BLOCK_HEIGHT / 8; y++) { + // Perform 2D media block read to load 8x32 pixel block + in = + media_block_load(readAcc, h_pos, v_pos); + + // Accumulate local histogram for each pixel value +#pragma unroll + for (int i = 0; i < 8; i++) { +#pragma unroll + for (int j = 0; j < 32; j++) { + histogram.select<1, 1>(in[i * 32 + j]) += 1; + } + } + + // Update starting offset for the next work block + v_pos += 8; + } + + // Declare a vector to store the offset for atomic write operation + simd offset(0, 1); // init to 0, 1, 2, ..., 7 + offset *= sizeof(unsigned int); + + // Update global sum by atomically adding each local histogram +#pragma unroll + for (int i = 0; i < NUM_BINS; i += 8) { + // Declare a vector to store the source for atomic write operation + simd src; + src = histogram.select<8, 1>(i); + +#ifdef __SYCL_DEVICE_ONLY__ + // flat_atomic(bins, offset, src, 1); + atomic_write( + bins, offset, src, 1); + offset += 8 * sizeof(unsigned int); +#else + auto vals = block_load(bins + i); + vals = vals + src; + block_store(bins + i, vals); +#endif + } + }); + }); + e.wait(); + + // SYCL will enqueue and run the kernel. Recall that the buffer's data is + // given back to the host at the end of scope. + } // make sure data is given back to the host at the end of this scope + + writeHist(bins); + writeHist(cpuHistogram); + // Checking Histogram + if (checkHistogram(cpuHistogram, bins)) { + std::cerr << "PASSED\n"; + return 0; + } else { + std::cerr << "FAILED\n"; + return 1; + } + + return 0; +} diff --git a/sycl/test/esimd/on-device/reduction.cpp b/sycl/test/esimd/on-device/reduction.cpp new file mode 100644 index 0000000000000..bfa4c196a5623 --- /dev/null +++ b/sycl/test/esimd/on-device/reduction.cpp @@ -0,0 +1,142 @@ +//==---------------- reduction.cpp - DPC++ ESIMD on-device test +//------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// TODO enable on Windows +// REQUIRES: linux +// REQUIRES: gpu +// RUN: %clangxx-esimd -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +using namespace cl::sycl; + +typedef short TYPE; + +int main(void) { + constexpr unsigned InputSize = 32; + constexpr unsigned OutputSize = 4; + constexpr unsigned VL = 32; + constexpr unsigned GroupSize = 1; + + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + auto ctxt = q.get_context(); + TYPE *A = + static_cast(malloc_shared(InputSize * sizeof(TYPE), dev, ctxt)); + int *B = + static_cast(malloc_shared(OutputSize * sizeof(int), dev, ctxt)); + + for (unsigned i = 0; i < InputSize; ++i) { + if (i == 19) { + A[i] = 32767; + } else { + A[i] = i; + } + } + + { + cl::sycl::range<1> GroupRange{InputSize / VL}; + cl::sycl::range<1> TaskRange{GroupSize}; + cl::sycl::nd_range<1> Range(GroupRange, TaskRange); + + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + GroupRange * TaskRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + + simd va = block_load(A + i * VL); + simd vb; + + vb.select<1, 0>(0) = reduce(va, std::plus<>()); + vb.select<1, 0>(1) = reduce(va, std::multiplies<>()); + vb.select<1, 0>(2) = hmax(va); + vb.select<1, 0>(3) = hmin(va); + + block_store(B + i * VL, vb); + }); + }); + e.wait(); + } + + auto compute_reduce_sum = [](TYPE A[InputSize]) -> int { + int retv = A[0]; + for (int i = 1; i < InputSize; i++) { + retv += A[i]; + } + return retv; + }; + + auto compute_reduce_prod = [](TYPE A[InputSize]) -> int { + int retv = A[0]; + for (int i = 1; i < InputSize; i++) { + retv *= A[i]; + } + return retv; + }; + + auto compute_reduce_max = [](TYPE A[InputSize]) -> int { + int retv = A[0]; + for (int i = 1; i < InputSize; i++) { + if (A[i] > retv) { + retv = A[i]; + } + } + return retv; + }; + + auto compute_reduce_min = [](TYPE A[InputSize]) -> int { + int retv = A[0]; + for (int i = 1; i < InputSize; i++) { + if (A[i] < retv) { + retv = A[i]; + } + } + return retv; + }; + + bool TestPass = true; + int ref = compute_reduce_sum(A); + if (B[0] != ref) { + std::cout << "Incorrect sum " << B[0] << ", expected " << ref << "\n"; + TestPass = false; + } + + ref = compute_reduce_prod(A); + if (B[1] != ref) { + std::cout << "Incorrect prod " << B[1] << ", expected " << ref << "\n"; + TestPass = false; + } + + ref = compute_reduce_max(A); + if (B[2] != ref) { + std::cout << "Incorrect max " << B[2] << ", expected " << ref << "\n"; + TestPass = false; + } + + ref = compute_reduce_min(A); + if (B[3] != ref) { + std::cout << "Incorrect min " << B[3] << ", expected " << ref << "\n"; + TestPass = false; + } + + if (!TestPass) { + std::cout << "Failed\n"; + return 1; + } + + std::cout << "Passed\n"; + return 0; +} diff --git a/sycl/test/esimd/on-device/stencil2.cpp b/sycl/test/esimd/on-device/stencil2.cpp new file mode 100644 index 0000000000000..190968aa8a336 --- /dev/null +++ b/sycl/test/esimd/on-device/stencil2.cpp @@ -0,0 +1,190 @@ +//==---------------- stencil2.cpp - DPC++ ESIMD on-device test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// TODO enable on Windows +// REQUIRES: linux +// REQUIRES: gpu +// RUN: %clangxx-esimd -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +// +// test smaller input size +// test 8x16 block size +// +#define DIM_SIZE (1 << 10) +#define SQUARE_SZ (DIM_SIZE * DIM_SIZE) + +#define WIDTH 16 +#define HEIGHT 16 + +#define GET_IDX(row, col) ((row)*32 + col) + +using namespace cl::sycl; + +void InitializeSquareMatrix(float *matrix, size_t const Dim, + bool const bSkipDataGeneration) { + memset(matrix, 0, Dim * Dim * sizeof(float)); + if (!bSkipDataGeneration) { + for (unsigned int iRow = 0; iRow < Dim; ++iRow) { + for (unsigned int iCol = 0; iCol < Dim; ++iCol) { + matrix[iRow * Dim + iCol] = static_cast(iRow + iCol); + } + } + } +} + +bool CheckResults(float *out, float *in) { + unsigned int n = DIM_SIZE; + for (unsigned int i = 0; i < n; i++) { + for (unsigned int j = 0; j < n; j++) { + if ((5 <= i) && (i < n - 5) && (5 <= j) && (j < n - 5)) { + float res = +in[(i - 5) * n + (j + 0)] * -0.02f + + in[(i - 4) * n + (j + 0)] * -0.025f + + in[(i - 3) * n + (j + 0)] * -0.0333333333333f + + in[(i - 2) * n + (j + 0)] * -0.05f + + in[(i - 1) * n + (j + 0)] * -0.1f + + in[(i + 0) * n + (j - 5)] * -0.02f + + in[(i + 0) * n + (j - 4)] * -0.025f + + in[(i + 0) * n + (j - 3)] * -0.0333333333333f + + in[(i + 0) * n + (j - 2)] * -0.05f + + in[(i + 0) * n + (j - 1)] * -0.1f + + in[(i + 0) * n + (j + 1)] * 0.1f + + in[(i + 0) * n + (j + 2)] * 0.05f + + in[(i + 0) * n + (j + 3)] * 0.0333333333333f + + in[(i + 0) * n + (j + 4)] * 0.025f + + in[(i + 0) * n + (j + 5)] * 0.02f + + in[(i + 1) * n + (j + 0)] * 0.1f + + in[(i + 2) * n + (j + 0)] * 0.05f + + in[(i + 3) * n + (j + 0)] * 0.0333333333333f + + in[(i + 4) * n + (j + 0)] * 0.025f + + in[(i + 5) * n + (j + 0)] * 0.02f; + + // check result + if (abs(res - out[i * n + j]) >= 0.0015f) { + std::cout << "out[" << i << "][" << j << "] = " << out[i * n + j] + << " expect result " << res << std::endl; + return false; + } + } + } + } + return true; +} + +int main(void) { + uint range_width = + (DIM_SIZE - 10) / WIDTH + (((DIM_SIZE - 10) % WIDTH == 0) ? 0 : 1); + uint range_height = + (DIM_SIZE - 10) / HEIGHT + (((DIM_SIZE - 10) % HEIGHT == 0) ? 0 : 1); + cl::sycl::range<2> GlobalRange{range_width, range_height}; + + std::cout << "width = " << range_width << " height = " << range_height + << std::endl; + cl::sycl::range<2> LocalRange{1, 1}; + + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + auto ctxt = q.get_context(); + + // create and init matrices + float *inputMatrix = + static_cast(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt)); + float *outputMatrix = + static_cast(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt)); + InitializeSquareMatrix(inputMatrix, DIM_SIZE, false); + InitializeSquareMatrix(outputMatrix, DIM_SIZE, true); + + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + uint h_pos = it.get_id(0); + uint v_pos = it.get_id(1); + + simd vin; + // matrix HEIGHT+10 x 32 + auto in = vin.format(); + + // + // rather than loading all data in + // the code will interleave data loading and compute + // first, we load enough data for the first 16 pixels + // + unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; +#pragma unroll + for (unsigned i = 0; i < 10; i++) { + in.row(i) = block_load(inputMatrix + off); + off += DIM_SIZE; + } + + unsigned out_off = + (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * + sizeof(float); + simd elm16(0, 1); + +#pragma unroll + for (unsigned i = 0; i < HEIGHT; i++) { + + in.row(10 + i) = block_load(inputMatrix + off); + off += DIM_SIZE; + + simd sum = + vin.select(GET_IDX(i, 5)) * -0.02f + + vin.select(GET_IDX(i + 1, 5)) * -0.025f + + vin.select(GET_IDX(i + 2, 5)) * -0.0333333333333f + + vin.select(GET_IDX(i + 3, 5)) * -0.05f + + vin.select(GET_IDX(i + 4, 5)) * -0.1f + + vin.select(GET_IDX(i + 6, 5)) * 0.1f + + vin.select(GET_IDX(i + 7, 5)) * 0.05f + + vin.select(GET_IDX(i + 8, 5)) * 0.0333333333333f + + vin.select(GET_IDX(i + 9, 5)) * 0.025f + + vin.select(GET_IDX(i + 10, 5)) * 0.02f + + vin.select(GET_IDX(i + 5, 0)) * -0.02f + + vin.select(GET_IDX(i + 5, 1)) * -0.025f + + vin.select(GET_IDX(i + 5, 2)) * -0.0333333333333f + + vin.select(GET_IDX(i + 5, 3)) * -0.05f + + vin.select(GET_IDX(i + 5, 4)) * -0.1f + + vin.select(GET_IDX(i + 5, 6)) * 0.1f + + vin.select(GET_IDX(i + 5, 7)) * 0.05f + + vin.select(GET_IDX(i + 5, 8)) * 0.0333333333333f + + vin.select(GET_IDX(i + 5, 9)) * 0.025f + + vin.select(GET_IDX(i + 5, 10)) * 0.02f; + + // predciate output + simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; + + simd elm16_off = elm16 * sizeof(float) + out_off; + scatter(outputMatrix, sum, elm16_off, p); + out_off += DIM_SIZE * sizeof(float); + + if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) + break; + } + }); + }); + e.wait(); + + // check result + bool passed = CheckResults(outputMatrix, inputMatrix); + if (passed) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + } + free(inputMatrix, ctxt); + free(outputMatrix, ctxt); + return 0; +} diff --git a/sycl/test/esimd/on-device/vadd_raw_send.cpp b/sycl/test/esimd/on-device/vadd_raw_send.cpp new file mode 100644 index 0000000000000..e21d94fae353c --- /dev/null +++ b/sycl/test/esimd/on-device/vadd_raw_send.cpp @@ -0,0 +1,152 @@ +//==---------------- vadd_raw_send.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// TODO enable on Windows +// REQUIRES: linux +// REQUIRES: gpu +// RUN: %clangxx-esimd -fsycl %s -o %t.out +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +using namespace cl::sycl; + +using namespace sycl::INTEL::gpu; + +template +ESIMD_INLINE simd dwaligned_block_read(AccessorTy acc, + unsigned int offset) { + simd src0; + simd oldDst; + + src0.select<1, 1>(2) = offset; + uint32_t exDesc = 0xA; + uint32_t desc = esimd_get_value(acc); + desc += 0x2284300; + constexpr uint8_t execSize = 0x84; + constexpr uint8_t sfid = 0x0; + constexpr uint8_t numSrc0 = 0x1; + constexpr uint8_t numDst = 0x2; + + return esimd_raw_send_load(oldDst, src0, exDesc, desc, execSize, sfid, + numSrc0, numDst); +} + +template +ESIMD_INLINE void block_write1(AccessorTy acc, unsigned int offset, + simd data) { + simd src0; + + src0.template select<1, 1>(2) = offset >> 4; + uint32_t exDesc = 0x4A; + uint32_t desc = esimd_get_value(acc); + desc += 0x20A0200; + constexpr uint8_t execSize = 0x83; + constexpr uint8_t sfid = 0x0; + constexpr uint8_t numSrc0 = 0x1; + constexpr uint8_t numSrc1 = 0x1; + + return esimd_raw_sends_store(src0, data, exDesc, desc, execSize, sfid, + numSrc0, numSrc1); +} + +template +ESIMD_INLINE void block_write2(AccessorTy acc, unsigned int offset, + simd data) { + simd src0; + auto src0_ref1 = + src0.template select<8, 1>(0).template format(); + auto src0_ref2 = src0.template select<8, 1>(8); + + src0_ref1.template select<1, 1>(2) = offset >> 4; + src0_ref2 = data; + uint32_t exDesc = 0xA; + uint32_t desc = esimd_get_value(acc); + desc += 0x40A0200; + constexpr uint8_t execSize = 0x83; + constexpr uint8_t sfid = 0x0; + constexpr uint8_t numSrc0 = 0x2; + + return esimd_raw_send_store(src0, exDesc, desc, execSize, sfid, numSrc0); +} + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + cl::sycl::range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + cl::sycl::range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + unsigned int offset = i * VL * sizeof(float); + simd va = dwaligned_block_read(PA, offset); + simd vb = dwaligned_block_read(PB, offset); + simd vc = va + vb; + constexpr int SIZE = VL / 2; + block_write1(PC, offset, vc.select(0).read()); + offset += SIZE * sizeof(float); + block_write2(PC, offset, vc.select(SIZE).read()); + }); + }); + e.wait(); + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} From 97e7e0314993448f92d43f6f7a03d93f2f19e73e Mon Sep 17 00:00:00 2001 From: kbobrovs Date: Fri, 25 Sep 2020 07:13:41 -0700 Subject: [PATCH 2/4] Apply suggestions from code review --- sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp index 5f5686f7bd992..99752b5e73608 100644 --- a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp @@ -658,6 +658,9 @@ ESIMD_INLINE ESIMD_NODEBUG uint32_t esimd_get_value(AccessorTy acc) { return __esimd_get_value(acc); #endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__ } +/// Raw send APIs below are used to implement the send messages on Intel(R) processor +/// graphics, as defined in the documentation at +/// https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-icllp-vol02a-commandreference-instructions_2.pdf /// \brief Raw sends load. /// @@ -693,11 +696,8 @@ ESIMD_INLINE ESIMD_NODEBUG uint32_t esimd_get_value(AccessorTy acc) { /// @param mask the predicate to specify enabled channels (optional - default to /// on). /// -/// Returns a simd vector of type T1 and size n1. +/// Returns the vector value read from memory. /// -/// Raw send APIs are used to implement the send messages on Intel� processor -/// graphics. -/// https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-icllp-vol02a-commandreference-instructions_2.pdf /// template @@ -749,7 +749,7 @@ esimd_raw_sends_load(simd msgDst, simd msgSrc0, /// @param mask the predicate to specify enabled channels (optional - default to /// on). /// -/// Returns a simd vector of type T1 and size n1. +/// Returns the vector value read from memory. /// template ESIMD_INLINE ESIMD_NODEBUG simd From c8d720cb6a1b6b6f45ace9f5e676cf46b8c7ff2a Mon Sep 17 00:00:00 2001 From: kbobrovs Date: Fri, 25 Sep 2020 07:19:49 -0700 Subject: [PATCH 3/4] fix formatting --- sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp index 99752b5e73608..c4176e5d5080a 100644 --- a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp @@ -658,8 +658,9 @@ ESIMD_INLINE ESIMD_NODEBUG uint32_t esimd_get_value(AccessorTy acc) { return __esimd_get_value(acc); #endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__ } -/// Raw send APIs below are used to implement the send messages on Intel(R) processor -/// graphics, as defined in the documentation at + ++/// Raw send APIs below are used to implement the send messages on Intel(R) ++/// processor graphics, as defined in the documentation at /// https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-icllp-vol02a-commandreference-instructions_2.pdf /// \brief Raw sends load. From 3a43ece49a7fadcb589f8803e39a9a0efa6e651d Mon Sep 17 00:00:00 2001 From: kbobrovs Date: Mon, 28 Sep 2020 02:09:02 -0700 Subject: [PATCH 4/4] Fix typo in comment in esimd_memory.hpp leading to compile error --- sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp index c4176e5d5080a..e159b758c30d6 100644 --- a/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp @@ -659,8 +659,8 @@ ESIMD_INLINE ESIMD_NODEBUG uint32_t esimd_get_value(AccessorTy acc) { #endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__ } -+/// Raw send APIs below are used to implement the send messages on Intel(R) -+/// processor graphics, as defined in the documentation at +/// Raw send APIs below are used to implement the send messages on Intel(R) +/// processor graphics, as defined in the documentation at /// https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-icllp-vol02a-commandreference-instructions_2.pdf /// \brief Raw sends load.