From ee179f108ec8bdea6fbc8004bd44446915791369 Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Fri, 4 Mar 2022 15:28:13 +0000
Subject: [PATCH 1/7] Temporary oneapi test file: adds unary coverage and
 fixes the broken verify_logic test.

Once a backend-agnostic ext_oneapi_bf16 aspect is added, we can add the
patches from this file to the existing bfloat16 test. Until then the CUDA
backend requires the Target to be provided at compile time in order to use
the Target_Builtins.
---
 SYCL/BFloat16/bfloat16_type-oneapi.cpp | 230 +++++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 SYCL/BFloat16/bfloat16_type-oneapi.cpp

diff --git a/SYCL/BFloat16/bfloat16_type-oneapi.cpp b/SYCL/BFloat16/bfloat16_type-oneapi.cpp
new file mode 100644
index 0000000000..80f9a96ea2
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_type-oneapi.cpp
@@ -0,0 +1,230 @@
+// REQUIRES: gpu, cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out
+
+//==----------- bfloat16_type_oneapi.cpp - SYCL bfloat16 type test ---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
+#include <cmath>
+
+using namespace cl::sycl;
+
+constexpr size_t N = 100;
+
+template <typename T> void assert_close(const T &C, const float ref) {
+  for (size_t i = 0; i < N; i++) {
+    auto diff = C[i] - ref;
+    assert(std::fabs(static_cast<float>(diff)) <
+           std::numeric_limits<float>::epsilon());
+  }
+}
+
+void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
+                          const float ref) {
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      A[index] = AVal;
+    });
+  });
+
+  assert_close(a.get_access(), ref);
+}
+
+void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
+                          const float ref) {
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      uint16_t AVal =
+          cl::sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
+      A[index] = cl::sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
+    });
+  });
+
+  assert_close(a.get_access(), ref);
+}
+
+void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r,
+                  const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
+                  range<1> &r, const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 bf1(1.0f);
+      cl::sycl::ext::oneapi::experimental::bfloat16 bf2 = -bf1;
+      cl::sycl::ext::oneapi::experimental::bfloat16 bf3 = +bf1;
+      cl::sycl::ext::oneapi::experimental::bfloat16 bf4(+bf1);
+
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      if (AVal) {
+        if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
+            !BVal) {
+          cl::sycl::ext::oneapi::experimental::bfloat16 CVal =
+              AVal != BVal ? AVal : BVal;
+          CVal--;
+          CVal++;
+          if (AVal == BVal) {
+            CVal -= AVal;
+            CVal *= 3.0;
+            CVal /= 2.0;
+          } else
+            CVal += BVal;
+          C[index] = CVal;
+        }
+      }
+
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+int main() {
+  device dev{default_selector()};
+
+  // TODO: replace is_gpu check with extension check when the appropriate part
+  // of implementation ready (aspect)
+  if (!dev.is_gpu()) {
+    std::cout << "This device doesn't support bfloat16 conversion feature"
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<float> vec_a(N, 5.0);
+  std::vector<float> vec_b(N, 2.0);
+  std::vector<float> vec_b_neg(N, -2.0);
+
+  range<1> r(N);
+  buffer<float, 1> a{vec_a.data(), r};
+  buffer<float, 1> b{vec_b.data(), r};
+  buffer<float, 1> b_neg{vec_b_neg.data(), r};
+
+  queue q{dev};
+
+  verify_conv_implicit(q, a, r, 5.0);
+  verify_conv_explicit(q, a, r, 5.0);
+  verify_add(q, a, b, r, 7.0);
+  verify_sub(q, a, b, r, 3.0);
+  verify_mul(q, a, b, r, 10.0);
+  verify_div(q, a, b, r, 2.5);
+  verify_logic(q, a, b, r, 7.0);
+  verify_add(q, a, b_neg, r, 3.0);
+  verify_sub(q, a, b_neg, r, 7.0);
+  verify_minus(q, a, r, -5.0);
+  verify_mul(q, a, b_neg, r, -10.0);
+  verify_div(q, a, b_neg, r, -2.5);
+  verify_logic(q, a, b_neg, r, 3.0);
+
+  float fl1 = -1;
+  float fl2 = +1;
+
+  return 0;
+}

From b81fc4a889b435e694863b713705c3f6e04443bf Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Wed, 16 Mar 2022 10:34:34 +0000
Subject: [PATCH 2/7] intel -> oneapi for other backend tests beyond cuda too

---
 SYCL/BFloat16/bfloat16_type.cpp | 38 ++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index ab550856e3..e4d164f77b 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -16,7 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <CL/sycl.hpp>
-#include <sycl/ext/intel/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
 #include <cmath>
 
@@ -37,7 +37,7 @@ void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
   q.submit([&](handler &cgh) {
     auto A = a.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::intel::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
       A[index] = AVal;
     });
   });
@@ -51,8 +51,8 @@
     auto A = a.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
       uint16_t AVal =
-          cl::sycl::ext::intel::experimental::bfloat16::from_float(A[index]);
-      A[index] = cl::sycl::ext::intel::experimental::bfloat16::to_float(AVal);
+          cl::sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
+      A[index] = cl::sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
     });
   });
 
@@ -68,9 +68,9 @@ void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::intel::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 CVal = AVal + BVal;
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
       C[index] = CVal;
     });
   });
@@ -87,9 +87,9 @@ void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::intel::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 CVal = AVal - BVal;
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
       C[index] = CVal;
     });
   });
@@ -106,9 +106,9 @@ void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::intel::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 CVal = AVal * BVal;
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
       C[index] = CVal;
     });
   });
@@ -125,9 +125,9 @@ void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::intel::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 CVal = AVal / BVal;
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
       C[index] = CVal;
     });
   });
@@ -144,12 +144,12 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::intel::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::intel::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
       if (AVal) {
         if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
             !BVal) {
-          cl::sycl::ext::intel::experimental::bfloat16 CVal =
+          cl::sycl::ext::oneapi::experimental::bfloat16 CVal =
               AVal != BVal ? AVal : BVal;
           CVal--;
           CVal++;

From 58bf5a7e202cdd99c502970a2dca683d8cf9948f Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Fri, 1 Apr 2022 18:15:09 +0100
Subject: [PATCH 3/7] Removed CUDA-specific file. Integrated the CUDA-specific
 changes into bfloat16_type.cpp, as well as a new unary negation unit test
 case.

---
 SYCL/BFloat16/bfloat16_type-oneapi.cpp | 230 -------------------------
 SYCL/BFloat16/bfloat16_type.cpp        |  27 ++-
 2 files changed, 25 insertions(+), 232 deletions(-)
 delete mode 100644 SYCL/BFloat16/bfloat16_type-oneapi.cpp

diff --git a/SYCL/BFloat16/bfloat16_type-oneapi.cpp b/SYCL/BFloat16/bfloat16_type-oneapi.cpp
deleted file mode 100644
index 80f9a96ea2..0000000000
--- a/SYCL/BFloat16/bfloat16_type-oneapi.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-// REQUIRES: gpu, cuda
-
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out
-
-//==----------- bfloat16_type_oneapi.cpp - SYCL bfloat16 type test ---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <CL/sycl.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
-
-#include <cmath>
-
-using namespace cl::sycl;
-
-constexpr size_t N = 100;
-
-template <typename T> void assert_close(const T &C, const float ref) {
-  for (size_t i = 0; i < N; i++) {
-    auto diff = C[i] - ref;
-    assert(std::fabs(static_cast<float>(diff)) <
-           std::numeric_limits<float>::epsilon());
-  }
-}
-
-void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
-                          const float ref) {
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      A[index] = AVal;
-    });
-  });
-
-  assert_close(a.get_access(), ref);
-}
-
-void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
-                          const float ref) {
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      uint16_t AVal =
-          cl::sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
-      A[index] = cl::sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
-    });
-  });
-
-  assert_close(a.get_access(), ref);
-}
-
-void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r,
-                  const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
-                  range<1> &r, const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 bf1(1.0f);
-      cl::sycl::ext::oneapi::experimental::bfloat16 bf2 = -bf1;
-      cl::sycl::ext::oneapi::experimental::bfloat16 bf3 = +bf1;
-      cl::sycl::ext::oneapi::experimental::bfloat16 bf4(+bf1);
-
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      if (AVal) {
-        if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
-            !BVal) {
-          cl::sycl::ext::oneapi::experimental::bfloat16 CVal =
-              AVal != BVal ? AVal : BVal;
-          CVal--;
-          CVal++;
-          if (AVal == BVal) {
-            CVal -= AVal;
-            CVal *= 3.0;
-            CVal /= 2.0;
-          } else
-            CVal += BVal;
-          C[index] = CVal;
-        }
-      }
-
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-int main() {
-  device dev{default_selector()};
-
-  // TODO: replace is_gpu check with extension check when the appropriate part
-  // of implementation ready (aspect)
-  if (!dev.is_gpu()) {
-    std::cout << "This device doesn't support bfloat16 conversion feature"
-              << std::endl;
-    return 0;
-  }
-
-  std::vector<float> vec_a(N, 5.0);
-  std::vector<float> vec_b(N, 2.0);
-  std::vector<float> vec_b_neg(N, -2.0);
-
-  range<1> r(N);
-  buffer<float, 1> a{vec_a.data(), r};
-  buffer<float, 1> b{vec_b.data(), r};
-  buffer<float, 1> b_neg{vec_b_neg.data(), r};
-
-  queue q{dev};
-
-  verify_conv_implicit(q, a, r, 5.0);
-  verify_conv_explicit(q, a, r, 5.0);
-  verify_add(q, a, b, r, 7.0);
-  verify_sub(q, a, b, r, 3.0);
-  verify_mul(q, a, b, r, 10.0);
-  verify_div(q, a, b, r, 2.5);
-  verify_logic(q, a, b, r, 7.0);
-  verify_add(q, a, b_neg, r, 3.0);
-  verify_sub(q, a, b_neg, r, 7.0);
-  verify_minus(q, a, r, -5.0);
-  verify_mul(q, a, b_neg, r, -10.0);
-  verify_div(q, a, b_neg, r, -2.5);
-  verify_logic(q, a, b_neg, r, 3.0);
-
-  float fl1 = -1;
-  float fl2 = +1;
-
-  return 0;
-}
diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index 43555e6157..e467b1d192 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,8 +1,13 @@
-// UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// UNSUPPORTED: hip
+// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=cuda %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
 // TODO currently the feature isn't supported on most of the devices
 // need to enable the test when the aspect and device_if feature are
 // introduced
+// TODO once the above is addressed and the invocations below are switched on
+// device filters should be added to ensure that the CUDA backend is not
+// executed as part of these invocations.
 // RUNx: %CPU_RUN_PLACEHOLDER %t.out
 // RUNx: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
@@ -97,6 +102,23 @@
   assert_close(c.get_access(), ref);
 }
 
+void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r,
+                  const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
 void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
                 const float ref) {
   buffer<float, 1> c{r};
@@ -199,6 +221,7 @@ int main() {
   verify_logic(q, a, b, r, 7.0);
   verify_add(q, a, b_neg, r, 3.0);
   verify_sub(q, a, b_neg, r, 7.0);
+  verify_minus(q, a, r, -5.0);
   verify_mul(q, a, b_neg, r, -10.0);
   verify_div(q, a, b_neg, r, -2.5);
   verify_logic(q, a, b_neg, r, 3.0);

From b68845781019c455da031164c1926e55ce4362b6 Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Fri, 1 Apr 2022 18:32:10 +0100
Subject: [PATCH 4/7] format

---
 SYCL/BFloat16/bfloat16_type.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index e467b1d192..1c2820ec6d 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -7,7 +7,7 @@
 // introduced
 // TODO once the above is addressed and the invocations below are switched on
 // device filters should be added to ensure that the CUDA backend is not
-// executed as part of these invocations.
+// executed as part of these invocations
 // RUNx: %CPU_RUN_PLACEHOLDER %t.out
 // RUNx: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
@@ -102,8 +102,7 @@
   assert_close(c.get_access(), ref);
 }
 
-void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r,
-                  const float ref) {
+void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r, const float ref) {
   buffer<float, 1> c{r};

From 7a52a8d3dff1070cfc66dd41e04cf91f5f8a98b1 Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Mon, 4 Apr 2022 11:17:50 +0100
Subject: [PATCH 5/7] Split CUDA vs. other-backend compilation into two test
 files.
---
 SYCL/BFloat16/bfloat16_type.cpp      | 214 +--------------------------
 SYCL/BFloat16/bfloat16_type.hpp      | 207 ++++++++++++++++++++++++++
 SYCL/BFloat16/bfloat16_type_cuda.cpp |  19 +++
 3 files changed, 230 insertions(+), 210 deletions(-)
 create mode 100644 SYCL/BFloat16/bfloat16_type.hpp
 create mode 100644 SYCL/BFloat16/bfloat16_type_cuda.cpp

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index 1c2820ec6d..7a99400a32 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,13 +1,8 @@
-// UNSUPPORTED: hip
-// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out
-// RUN: env SYCL_DEVICE_FILTER=cuda %t.out
-// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // TODO currently the feature isn't supported on most of the devices
 // need to enable the test when the aspect and device_if feature are
 // introduced
-// TODO once the above is addressed and the invocations below are switched on
-// device filters should be added to ensure that the CUDA backend is not
-// executed as part of these invocations
 // RUNx: %CPU_RUN_PLACEHOLDER %t.out
 // RUNx: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
@@ -20,210 +15,9 @@
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
-
-#include <cmath>
-
-using namespace cl::sycl;
-
-constexpr size_t N = 100;
-
-template <typename T> void assert_close(const T &C, const float ref) {
-  for (size_t i = 0; i < N; i++) {
-    auto diff = C[i] - ref;
-    assert(std::fabs(static_cast<float>(diff)) <
-           std::numeric_limits<float>::epsilon());
-  }
-}
-
-void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
-                          const float ref) {
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      A[index] = AVal;
-    });
-  });
-
-  assert_close(a.get_access(), ref);
-}
-
-void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
-                          const float ref) {
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      uint16_t AVal =
-          cl::sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
-      A[index] = cl::sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
-    });
-  });
-
-  assert_close(a.get_access(), ref);
-}
-
-void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r, const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
-                const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
-      C[index] = CVal;
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
-
-void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
-                  range<1> &r, const float ref) {
-  buffer<float, 1> c{r};
-
-  q.submit([&](handler &cgh) {
-    auto A = a.get_access(cgh);
-    auto B = b.get_access(cgh);
-    auto C = c.get_access(cgh);
-    cgh.parallel_for(r, [=](id<1> index) {
-      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      if (AVal) {
-        if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
-            !BVal) {
-          cl::sycl::ext::oneapi::experimental::bfloat16 CVal =
-              AVal != BVal ? AVal : BVal;
-          CVal--;
-          CVal++;
-          if (AVal == BVal) {
-            CVal -= AVal;
-            CVal *= 3.0;
-            CVal /= 2.0;
-          } else
-            CVal += BVal;
-          C[index] = CVal;
-        }
-      }
-    });
-  });
-
-  assert_close(c.get_access(), ref);
-}
+#include "bfloat16_type.hpp"
 
 int main() {
-  device dev{default_selector()};
-
-  // TODO: replace is_gpu check with extension check when the appropriate part
-  // of implementation ready (aspect)
-  if (!dev.is_gpu() && !dev.is_cpu()) {
-    std::cout << "This device doesn't support bfloat16 conversion feature"
-              << std::endl;
-    return 0;
-  }
-
-  std::vector<float> vec_a(N, 5.0);
-  std::vector<float> vec_b(N, 2.0);
-  std::vector<float> vec_b_neg(N, -2.0);
-
-  range<1> r(N);
-  buffer<float, 1> a{vec_a.data(), r};
-  buffer<float, 1> b{vec_b.data(), r};
-  buffer<float, 1> b_neg{vec_b_neg.data(), r};
-
-  queue q{dev};
-
-  verify_conv_implicit(q, a, r, 5.0);
-  verify_conv_explicit(q, a, r, 5.0);
-  verify_add(q, a, b, r, 7.0);
-  verify_sub(q, a, b, r, 3.0);
-  verify_mul(q, a, b, r, 10.0);
-  verify_div(q, a, b, r, 2.5);
-  verify_logic(q, a, b, r, 7.0);
-  verify_add(q, a, b_neg, r, 3.0);
-  verify_sub(q, a, b_neg, r, 7.0);
-  verify_minus(q, a, r, -5.0);
-  verify_mul(q, a, b_neg, r, -10.0);
-  verify_div(q, a, b_neg, r, -2.5);
-  verify_logic(q, a, b_neg, r, 3.0);
 
-  return 0;
+  return run_tests();
 }
diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
new file mode 100644
index 0000000000..99185be708
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -0,0 +1,207 @@
+#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
+#include <cmath>
+
+using namespace cl::sycl;
+
+constexpr size_t N = 100;
+
+template <typename T> void assert_close(const T &C, const float ref) {
+  for (size_t i = 0; i < N; i++) {
+    auto diff = C[i] - ref;
+    assert(std::fabs(static_cast<float>(diff)) <
+           std::numeric_limits<float>::epsilon());
+  }
+}
+
+void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
+                          const float ref) {
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      A[index] = AVal;
+    });
+  });
+
+  assert_close(a.get_access(), ref);
+}
+
+void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
+                          const float ref) {
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      uint16_t AVal =
+          cl::sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
+      A[index] = cl::sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
+    });
+  });
+
+  assert_close(a.get_access(), ref);
+}
+
+void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r, const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
+                const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
+      C[index] = CVal;
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
+                  range<1> &r, const float ref) {
+  buffer<float, 1> c{r};
+
+  q.submit([&](handler &cgh) {
+    auto A = a.get_access(cgh);
+    auto B = b.get_access(cgh);
+    auto C = c.get_access(cgh);
+    cgh.parallel_for(r, [=](id<1> index) {
+      cl::sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      cl::sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      if (AVal) {
+        if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
+            !BVal) {
+          cl::sycl::ext::oneapi::experimental::bfloat16 CVal =
+              AVal != BVal ? AVal : BVal;
+          CVal--;
+          CVal++;
+          if (AVal == BVal) {
+            CVal -= AVal;
+            CVal *= 3.0;
+            CVal /= 2.0;
+          } else
+            CVal += BVal;
+          C[index] = CVal;
+        }
+      }
+    });
+  });
+
+  assert_close(c.get_access(), ref);
+}
+
+int run_tests() {
+  device dev{default_selector()};
+
+  // TODO: replace is_gpu check with extension check when the appropriate part
+  // of implementation ready (aspect)
+  if (!dev.is_gpu() && !dev.is_cpu()) {
+    std::cout << "This device doesn't support bfloat16 conversion feature"
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<float> vec_a(N, 5.0);
+  std::vector<float> vec_b(N, 2.0);
+  std::vector<float> vec_b_neg(N, -2.0);
+
+  range<1> r(N);
+  buffer<float, 1> a{vec_a.data(), r};
+  buffer<float, 1> b{vec_b.data(), r};
+  buffer<float, 1> b_neg{vec_b_neg.data(), r};
+
+  queue q{dev};
+
+  verify_conv_implicit(q, a, r, 5.0);
+  verify_conv_explicit(q, a, r, 5.0);
+  verify_add(q, a, b, r, 7.0);
+  verify_sub(q, a, b, r, 3.0);
+  verify_mul(q, a, b, r, 10.0);
+  verify_div(q, a, b, r, 2.5);
+  verify_logic(q, a, b, r, 7.0);
+  verify_add(q, a, b_neg, r, 3.0);
+  verify_sub(q, a, b_neg, r, 7.0);
+  verify_minus(q, a, r, -5.0);
+  verify_mul(q, a, b_neg, r, -10.0);
+  verify_div(q, a, b_neg, r, -2.5);
+  verify_logic(q, a, b_neg, r, 3.0);
+
+  return 0;
+}
\ No newline at end of file
diff --git a/SYCL/BFloat16/bfloat16_type_cuda.cpp b/SYCL/BFloat16/bfloat16_type_cuda.cpp
new file mode 100644
index 0000000000..faca071f77
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_type_cuda.cpp
@@ -0,0 +1,19 @@
+// REQUIRES: gpu, cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out
+// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test execution once it does.
+// RUNx: %t.out
+
+//==--------- bfloat16_type_cuda.cpp - SYCL bfloat16 type test -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "bfloat16_type.hpp"
+
+int main() {
+
+  return run_tests();
+}

From fd380a0f6f8c43a0f4a8b7f3b8ab47039f26346b Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Mon, 4 Apr 2022 11:27:58 +0100
Subject: [PATCH 6/7] format

---
 SYCL/BFloat16/bfloat16_type.cpp      | 5 ++---
 SYCL/BFloat16/bfloat16_type_cuda.cpp | 8 +++-----
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index 7a99400a32..25bb8ac15c 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -17,7 +17,4 @@
 
 #include "bfloat16_type.hpp"
 
-int main() {
-
-  return run_tests();
-}
+int main() { return run_tests(); }
diff --git a/SYCL/BFloat16/bfloat16_type_cuda.cpp b/SYCL/BFloat16/bfloat16_type_cuda.cpp
index faca071f77..89e46884b4 100644
--- a/SYCL/BFloat16/bfloat16_type_cuda.cpp
+++ b/SYCL/BFloat16/bfloat16_type_cuda.cpp
@@ -1,6 +1,7 @@
 // REQUIRES: gpu, cuda
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out
-// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test execution once it does.
+// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test
+// execution once it does.
 // RUNx: %t.out
 
 //==--------- bfloat16_type_cuda.cpp - SYCL bfloat16 type test -------------==//
@@ -13,7 +14,4 @@
 
 #include "bfloat16_type.hpp"
 
-int main() {
-
-  return run_tests();
-}
+int main() { return run_tests(); }

From 5f0a15769d5e7fde9cf8299e1f9233b67da8febd Mon Sep 17 00:00:00 2001
From: "jack.kirk"
Date: Mon, 4 Apr 2022 14:58:14 +0100
Subject: [PATCH 7/7] added newline at file end

---
 SYCL/BFloat16/bfloat16_type.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index 99185be708..230757b3be 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -204,4 +204,4 @@ int run_tests() {
   verify_logic(q, a, b_neg, r, 3.0);
 
   return 0;
-}
\ No newline at end of file
+}