From d533847904a7b58e05dc05d3e640e85d6f0adb12 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Mon, 20 Jun 2022 11:23:53 +0800 Subject: [PATCH 01/14] [SYCL][libdevice] Add SIMD emulation APIs for imf libdevice Signed-off-by: jinge90 --- libdevice/cmake/modules/ImfSrcConcate.cmake | 1 + libdevice/cmake/modules/SYCLLibdevice.cmake | 1 + libdevice/device_imf.hpp | 7 +- libdevice/imf_utils/simd_emulate.cpp | 223 ++++++++++++++++++ libdevice/imf_wrapper.cpp | 92 ++++++++ .../sycl-post-link/SYCLDeviceLibReqMask.cpp | 12 + sycl/include/CL/sycl/builtins.hpp | 17 ++ 7 files changed, 350 insertions(+), 3 deletions(-) create mode 100644 libdevice/imf_utils/simd_emulate.cpp diff --git a/libdevice/cmake/modules/ImfSrcConcate.cmake b/libdevice/cmake/modules/ImfSrcConcate.cmake index 4ea04f9d6f72a..59e40736289f6 100644 --- a/libdevice/cmake/modules/ImfSrcConcate.cmake +++ b/libdevice/cmake/modules/ImfSrcConcate.cmake @@ -1,6 +1,7 @@ set(imf_fp32_fallback_src_list imf_utils/integer_misc.cpp imf_utils/half_convert.cpp imf_utils/float_convert.cpp + imf_utils/simd_emulate.cpp imf/imf_inline_fp32.cpp) set(imf_fp64_fallback_src_list imf_utils/double_convert.cpp diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index 6954133d1cb2d..7596355b0beb1 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -123,6 +123,7 @@ set(imf_fallback_fp32_deps device.h device_imf.hpp imf_half.hpp imf_utils/integer_misc.cpp imf_utils/float_convert.cpp imf_utils/half_convert.cpp + imf_utils/simd_emulate.cpp imf/imf_inline_fp32.cpp) set(imf_fallback_fp64_deps device.h device_imf.hpp imf_half.hpp imf_utils/double_convert.cpp diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 9466f2ce3970a..5d8752ac3ad98 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -458,9 +458,10 @@ static inline int __popcll(unsigned long long int x) { #endif } -static inline unsigned int 
__abs(int x) { return x < 0 ? -x : x; } - -static inline unsigned long long int __abs(long long int x) { +template +static inline typename std::make_unsigned::type __abs(T x) { + static_assert((std::is_signed::value && std::is_integral::value), + "__abs can only accept signed integral type."); return x < 0 ? -x : x; } diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp new file mode 100644 index 0000000000000..4fdccb4a567d2 --- /dev/null +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -0,0 +1,223 @@ +//==------ simd_emulate.cpp - serial implementation to emulate simd functions +// ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../device_imf.hpp" +#include +#ifdef __LIBDEVICE_IMF_ENABLED__ + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs2(unsigned int x) { + uint16_t res_buf[2] = { + 0, + }; + for (size_t idx = 0; idx < 2; ++idx) { + int16_t tmp = __bit_cast( + __get_bytes_by_index(x, idx)); + res_buf[idx] = __bit_cast(__abs(tmp)); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs4(unsigned int x) { + uint8_t res_buf[4] = { + 0, + }; + for (size_t idx = 0; idx < 4; ++idx) { + int8_t tmp = + __bit_cast(__get_bytes_by_index(x, idx)); + res_buf[idx] = __bit_cast(__abs(tmp)); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss2(unsigned int x) { + uint16_t res_buf[2] = { + 0, + }; + for (size_t idx = 0; idx < 2; ++idx) { + uint16_t tmp = __get_bytes_by_index(x, idx); + if (tmp == 0x8000) + res_buf[idx] = 0x7FFF; + else { + int16_t s_tmp = __bit_cast(tmp); + res_buf[idx] = __bit_cast(__abs(s_tmp)); + } + } + return 
__assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss4(unsigned int x) { + uint8_t res_buf[4] = { + 0, + }; + for (size_t idx = 0; idx < 4; ++idx) { + uint8_t tmp = __get_bytes_by_index(x, idx); + if (tmp == 0x80) + res_buf[idx] = 0x7F; + else { + int8_t s_tmp = __bit_cast(tmp); + res_buf[idx] = __bit_cast(__abs(s_tmp)); + } + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs2(unsigned int x, unsigned int y) { + uint16_t res_buf[2] = { + 0, + }; + int32_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 2; ++idx) { + x_tmp = static_cast(__bit_cast( + __get_bytes_by_index(x, idx))); + y_tmp = static_cast(__bit_cast( + __get_bytes_by_index(y, idx))); + x_tmp -= y_tmp; + res_buf[idx] = static_cast(__abs(x_tmp)); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs4(unsigned int x, unsigned int y) { + uint8_t res_buf[4] = { + 0, + }; + int16_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 4; ++idx) { + x_tmp = static_cast(__bit_cast( + __get_bytes_by_index(x, idx))); + y_tmp = static_cast(__bit_cast( + __get_bytes_by_index(y, idx))); + x_tmp -= y_tmp; + res_buf[idx] = static_cast(__abs(x_tmp)); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu2(unsigned int x, unsigned int y) { + uint16_t res_buf[2] = { + 0, + }; + uint16_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 2; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + y_tmp = __get_bytes_by_index(y, idx); + if (x_tmp < y_tmp) + std::swap(x_tmp, y_tmp); + x_tmp -= y_tmp; + res_buf[idx] = x_tmp; + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu4(unsigned int x, unsigned int y) { + uint8_t res_buf[4] = { + 0, + }; + uint8_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 4; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + 
y_tmp = __get_bytes_by_index(y, idx); + if (x_tmp < y_tmp) + std::swap(x_tmp, y_tmp); + x_tmp -= y_tmp; + res_buf[idx] = x_tmp; + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd2(unsigned int x, unsigned int y) { + uint16_t res_buf[2] = { + 0, + }; + + uint32_t tmp; + uint16_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 2; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + y_tmp = __get_bytes_by_index(y, idx); + tmp = x_tmp + y_tmp; + res_buf[idx] = __get_bytes_by_index(tmp, 0); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd4(unsigned int x, unsigned int y) { + uint8_t res_buf[4] = { + 0, + }; + + uint16_t tmp; + uint8_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 4; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + y_tmp = __get_bytes_by_index(y, idx); + tmp = x_tmp + y_tmp; + res_buf[idx] = __get_bytes_by_index(tmp, 0); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss2(unsigned int x, unsigned int y) { + uint16_t res_buf[2] = { + 0, + }; + + int32_t tmp; + int16_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 2; ++idx) { + x_tmp = __bit_cast( + __get_bytes_by_index(x, idx)); + y_tmp = __bit_cast( + __get_bytes_by_index(y, idx)); + tmp = x_tmp + y_tmp; + if (tmp > 32767) + res_buf[idx] = 0x7FFF; + else if (tmp < -32768) + res_buf[idx] = 0x8000; + else + res_buf[idx] = __get_bytes_by_index(tmp, 0); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss4(unsigned int x, unsigned int y) { + uint8_t res_buf[4] = { + 0, + }; + + int16_t tmp; + int8_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 4; ++idx) { + x_tmp = + __bit_cast(__get_bytes_by_index(x, idx)); + y_tmp = + __bit_cast(__get_bytes_by_index(y, idx)); + tmp = x_tmp + y_tmp; + if (tmp > 127) + res_buf[idx] = 0x7F; + else if (tmp < -128) + res_buf[idx] = 0x80; + 
else + res_buf[idx] = __get_bytes_by_index(tmp, 0); + } + return __assemble_integral_value(res_buf); +} +#endif diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index 6197845aad440..23521de7cb773 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -627,4 +627,96 @@ _iml_half_internal __imf_copysignf16(_iml_half_internal x, _iml_half_internal y) { return __devicelib_imf_copysignf16(x, y); } + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabs2(unsigned int x) { return __devicelib_imf_vabs2(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabs4(unsigned int x) { return __devicelib_imf_vabs4(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffs2(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffs2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffs4(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffs4(x, y); +} + 
+DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsss2(unsigned int x) { + return __devicelib_imf_vabsss2(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsss4(unsigned int x) { + return __devicelib_imf_vabsss4(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vadd2(unsigned int x, unsigned int y) { + return __devicelib_imf_vadd2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vadd4(unsigned int x, unsigned int y) { + return __devicelib_imf_vadd4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddss2(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddss2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddss4(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddss4(x, y); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index ca20506d14160..03b71d5518705 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -254,6 +254,18 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_fmaxf16", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fminf16", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_copysignf16", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffs4", DeviceLibExt::cl_intel_devicelib_imf}, + 
{"__devicelib_imf_vabsdiffu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vadd2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vadd4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddss4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index 3ca2f945be111..d806f7ea24d64 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -1879,6 +1879,23 @@ extern SYCL_EXTERNAL double __imf_uint2double_rn(unsigned int x); extern SYCL_EXTERNAL double __imf_uint2double_ru(unsigned int x); extern SYCL_EXTERNAL double __imf_uint2double_rz(unsigned int x); extern SYCL_EXTERNAL double __imf_hiloint2double(int hi, int lo); + +extern SYCL_EXTERNAL unsigned int __imf_vabs2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabs4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabsss2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabsss4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffs2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffs4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vadd2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vadd4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddss2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int 
__imf_vaddss4(unsigned int x, unsigned int y); } #ifdef __GLIBC__ extern "C" { From 0469d36abcff41c3380915c9b74b7fce4f842141 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Mon, 20 Jun 2022 13:55:54 +0800 Subject: [PATCH 02/14] Remove inclusion Signed-off-by: jinge90 --- libdevice/device_imf.hpp | 8 ++++++++ libdevice/imf_utils/simd_emulate.cpp | 5 ++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 5d8752ac3ad98..47726bbece9c1 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -465,6 +465,14 @@ static inline typename std::make_unsigned::type __abs(T x) { return x < 0 ? -x : x; } +template +static inline void __swap(T &x, T &y) { + static_assert(std::is_integral::value, "__swap can only accept integral type."); + T tmp = x; + x = y; + y = x; +} + template static inline Ty2 __get_bytes_by_index(Ty1 x, size_t idx) { static_assert(!std::is_signed::value && !std::is_signed::value, diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 4fdccb4a567d2..e8adbe14bdf3a 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -8,7 +8,6 @@ //===----------------------------------------------------------------------===// #include "../device_imf.hpp" -#include #ifdef __LIBDEVICE_IMF_ENABLED__ DEVICE_EXTERN_C_INLINE @@ -115,7 +114,7 @@ unsigned int __devicelib_imf_vabsdiffu2(unsigned int x, unsigned int y) { x_tmp = __get_bytes_by_index(x, idx); y_tmp = __get_bytes_by_index(y, idx); if (x_tmp < y_tmp) - std::swap(x_tmp, y_tmp); + __swap(x_tmp, y_tmp); x_tmp -= y_tmp; res_buf[idx] = x_tmp; } @@ -132,7 +131,7 @@ unsigned int __devicelib_imf_vabsdiffu4(unsigned int x, unsigned int y) { x_tmp = __get_bytes_by_index(x, idx); y_tmp = __get_bytes_by_index(y, idx); if (x_tmp < y_tmp) - std::swap(x_tmp, y_tmp); + __swap(x_tmp, y_tmp); x_tmp -= y_tmp; res_buf[idx] = x_tmp; } From 62a13d36fc4dfac9859cb3209c7ec97727547c10 
Mon Sep 17 00:00:00 2001 From: jinge90 Date: Mon, 20 Jun 2022 14:08:27 +0800 Subject: [PATCH 03/14] Fix clang format issue Signed-off-by: jinge90 --- libdevice/device_imf.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 47726bbece9c1..6f4f42ef5e91c 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -465,9 +465,9 @@ static inline typename std::make_unsigned::type __abs(T x) { return x < 0 ? -x : x; } -template -static inline void __swap(T &x, T &y) { - static_assert(std::is_integral::value, "__swap can only accept integral type."); +template static inline void __swap(T &x, T &y) { + static_assert(std::is_integral::value, + "__swap can only accept integral type."); T tmp = x; x = y; y = x; From 9eadc0ed7d539ae40145e10c5d80e6282b40fe68 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Fri, 24 Jun 2022 16:46:40 +0800 Subject: [PATCH 04/14] Add __imf_vcmp* functions part1 Signed-off-by: jinge90 --- libdevice/device_imf.hpp | 3 +- libdevice/imf_utils/simd_emulate.cpp | 177 ++++++++++++++++++ libdevice/imf_wrapper.cpp | 112 +++++++++++ .../sycl-post-link/SYCLDeviceLibReqMask.cpp | 14 ++ sycl/include/CL/sycl/builtins.hpp | 22 +++ 5 files changed, 327 insertions(+), 1 deletion(-) diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 6f4f42ef5e91c..603548e382501 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -13,6 +13,7 @@ #include "imf_half.hpp" #include #include +#include #ifdef __LIBDEVICE_IMF_ENABLED__ @@ -470,7 +471,7 @@ template static inline void __swap(T &x, T &y) { "__swap can only accept integral type."); T tmp = x; x = y; - y = x; + y = tmp; } template diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index e8adbe14bdf3a..6970c578ef795 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -219,4 +219,181 @@ unsigned int 
__devicelib_imf_vaddss4(unsigned int x, unsigned int y) { } return __assemble_integral_value(res_buf); } + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus2(unsigned int x, unsigned int y) { + uint16_t res_buf[2] = { + 0, + }; + uint32_t tmp; + uint16_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 2; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + y_tmp = __get_bytes_by_index(y, idx); + tmp = x_tmp + y_tmp; + if (tmp > 65535) + res_buf[idx] = 0xFFFF; + else + res_buf[idx] = static_cast(tmp); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus4(unsigned int x, unsigned int y) { + uint8_t res_buf[4] = { + 0, + }; + uint16_t tmp; + uint8_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 4; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + y_tmp = __get_bytes_by_index(y, idx); + tmp = x_tmp + y_tmp; + if (tmp > 255) + res_buf[idx] = 0xFF; + else + res_buf[idx] = static_cast(tmp); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs2(unsigned int x, unsigned int y) { + uint16_t res_buf[2] = { + 0, + }; + int16_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 2; ++idx) { + x_tmp = __bit_cast( + __get_bytes_by_index(x, idx)); + y_tmp = __bit_cast( + __get_bytes_by_index(y, idx)); + res_buf[idx] = __bit_cast(__shadd(x_tmp, y_tmp)); + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { + uint8_t res_buf[4] = { + 0, + }; + int8_t x_tmp, y_tmp; + for (size_t idx = 0; idx < 4; ++idx) { + x_tmp = + __bit_cast(__get_bytes_by_index(x, idx)); + y_tmp = + __bit_cast(__get_bytes_by_index(y, idx)); + res_buf[idx] = __bit_cast(__shadd(x_tmp, y_tmp)); + } + return __assemble_integral_value(res_buf); +} + +template +static inline unsigned int __internal_vcmps_op(unsigned int x, unsigned int y, + Comp comp) { + static_assert(std::is_same::value || + 
std::is_same::value, + "__internal_vcmps_op only accept int8_t and int16_t."); + static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_vcmps_op size mismatch"); + typedef typename std::make_unsigned::type UTp; + UTp res_buf[N] = { + 0, + }; + Tp x_tmp, y_tmp; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = __bit_cast(__get_bytes_by_index(x, idx)); + y_tmp = __bit_cast(__get_bytes_by_index(y, idx)); + if (comp(x_tmp, y_tmp)) + res_buf[idx] = static_cast(-1); + else + res_buf[idx] = 0; + } + return __assemble_integral_value(res_buf); +} + +template +static inline unsigned int __internal_vcmpu_op(unsigned int x, unsigned int y, + Comp comp) { + static_assert(std::is_same::value || + std::is_same::value, + "__internal_vcmpu_op only accept uint8_t and uint16_t."); + static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_vcmpu_op size mismatch"); + Tp res_buf[N] = { + 0, + }; + Tp x_tmp, y_tmp; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = __get_bytes_by_index(x, idx); + y_tmp = __get_bytes_by_index(y, idx); + if (comp(x_tmp, y_tmp)) + res_buf[idx] = static_cast(-1); + else + res_buf[idx] = 0; + } + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { + return __internal_vcmpu_op>( + x, y, std::equal_to()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq4(unsigned int x, unsigned int y) { + return __internal_vcmpu_op>( + x, y, std::equal_to()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges2(unsigned int x, unsigned int y) { + return __internal_vcmps_op>( + x, y, std::greater_equal()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges4(unsigned int x, unsigned int y) { + return __internal_vcmps_op>( + x, y, std::greater_equal()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu2(unsigned int x, unsigned int y) { + return __internal_vcmpu_op>( + x, y, std::greater_equal()); +} 
+ +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu4(unsigned int x, unsigned int y) { + return __internal_vcmpu_op>( + x, y, std::greater_equal()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts2(unsigned int x, unsigned int y) { + return __internal_vcmps_op>( + x, y, std::greater()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts4(unsigned int x, unsigned int y) { + return __internal_vcmps_op>( + x, y, std::greater()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu2(unsigned int x, unsigned int y) { + return __internal_vcmpu_op>( + x, y, std::greater()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu4(unsigned int x, unsigned int y) { + return __internal_vcmpu_op>( + x, y, std::greater()); +} #endif diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index 23521de7cb773..e5126624e68f4 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -664,6 +664,48 @@ unsigned int __devicelib_imf_vaddss2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vaddss4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu2(unsigned int, unsigned int); + 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __imf_vabs2(unsigned int x) { return __devicelib_imf_vabs2(x); } @@ -719,4 +761,74 @@ DEVICE_EXTERN_C_INLINE unsigned int __imf_vaddss4(unsigned int x, unsigned int y) { return __devicelib_imf_vaddss4(x, y); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddus2(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddus2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddus4(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddus4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgs2(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgs2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgs4(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgs4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpeq2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpeq2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpeq4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpeq4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpges2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpges2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpges4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpges4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgeu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgeu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgeu4(unsigned int x, unsigned int y) { + return 
__devicelib_imf_vcmpgeu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgtu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgtu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgtu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgtu4(x, y); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 03b71d5518705..f6bbec992df65 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -266,6 +266,20 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vadd4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vaddss2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vaddss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddus2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddus4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpeq2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpeq4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpges2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpges4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgeu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgeu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgtu2", 
DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgtu4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index d806f7ea24d64..f83163260f65f 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -1896,6 +1896,28 @@ extern SYCL_EXTERNAL unsigned int __imf_vadd2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vadd4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vaddss2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vaddss4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddus2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddus4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgs2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgs4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpeq2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpeq4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpges2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpges4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgeu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgeu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgtu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int 
__imf_vcmpgtu4(unsigned int x, + unsigned int y); } #ifdef __GLIBC__ extern "C" { From 482d684a77de16ee79b83042eae5dfb207e8eb23 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Fri, 24 Jun 2022 17:25:28 +0800 Subject: [PATCH 05/14] fix clang-format Signed-off-by: jinge90 --- libdevice/device_imf.hpp | 2 +- libdevice/imf_utils/simd_emulate.cpp | 12 ++++++++++++ libdevice/imf_wrapper.cpp | 16 ++++++++++++++++ .../sycl-post-link/SYCLDeviceLibReqMask.cpp | 2 ++ sycl/include/CL/sycl/builtins.hpp | 4 ++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 603548e382501..114b907e8585a 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -12,8 +12,8 @@ #include "device.h" #include "imf_half.hpp" #include -#include #include +#include #ifdef __LIBDEVICE_IMF_ENABLED__ diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 6970c578ef795..88862fb9639cd 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -396,4 +396,16 @@ unsigned int __devicelib_imf_vcmpgtu4(unsigned int x, unsigned int y) { return __internal_vcmpu_op>( x, y, std::greater()); } + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples2(unsigned int x, unsigned int y) { + return __internal_vcmps_op>( + x, y, std::less_equal()); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples4(unsigned int x, unsigned int y) { + return __internal_vcmps_op>( + x, y, std::less_equal()); +} #endif diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index e5126624e68f4..778f8eacca170 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -706,6 +706,12 @@ unsigned int __devicelib_imf_vcmpgtu2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples2(unsigned int, unsigned int); + 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __imf_vabs2(unsigned int x) { return __devicelib_imf_vabs2(x); } @@ -831,4 +837,14 @@ DEVICE_EXTERN_C_INLINE unsigned int __imf_vcmpgtu4(unsigned int x, unsigned int y) { return __devicelib_imf_vcmpgtu4(x, y); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmples2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmples2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmples4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmples4(x, y); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index f6bbec992df65..2a3b1297c0e10 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -280,6 +280,8 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vcmpgts4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpgtu2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpgtu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmples2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmples4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index f83163260f65f..00475389d0c29 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -1918,6 +1918,10 @@ extern SYCL_EXTERNAL unsigned int __imf_vcmpgtu2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpgtu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmples2(unsigned int x, + unsigned int y); 
+extern SYCL_EXTERNAL unsigned int __imf_vcmples4(unsigned int x, + unsigned int y); } #ifdef __GLIBC__ extern "C" { From 5ed3c0675d9d9f254cd8873d5d90db8016ff0434 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Mon, 27 Jun 2022 16:26:52 +0800 Subject: [PATCH 06/14] add __imf_vset* APIs Signed-off-by: jinge90 --- libdevice/device_imf.hpp | 1 - libdevice/imf_utils/simd_emulate.cpp | 354 +++++++++++++++--- libdevice/imf_wrapper.cpp | 288 ++++++++++++++ .../sycl-post-link/SYCLDeviceLibReqMask.cpp | 36 ++ sycl/include/CL/sycl/builtins.hpp | 58 +++ 5 files changed, 694 insertions(+), 43 deletions(-) diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 114b907e8585a..5bc72b1b8045a 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -12,7 +12,6 @@ #include "device.h" #include "imf_half.hpp" #include -#include #include #ifdef __LIBDEVICE_IMF_ENABLED__ diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 88862fb9639cd..b318500398700 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -290,122 +290,392 @@ unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { return __assemble_integral_value(res_buf); } -template -static inline unsigned int __internal_vcmps_op(unsigned int x, unsigned int y, - Comp comp) { +template class __min_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return (x < y) ? x : y; } +}; + +template class __max_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return (x > y) ? x : y; } +}; + +template class __eq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x == y) ? static_cast(-1) : 0); + } +}; + +template class __neq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x != y) ? 
static_cast(-1) : 0); + } +}; + +template class __set_eq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x == y) ? 1 : 0); } +}; + +template class __set_neq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x != y) ? 1 : 0); } +}; + +template class __gt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x > y) ? static_cast(-1) : 0); + } +}; + +template class __set_gt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x > y) ? 1 : 0); } +}; + +template class __ge_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x >= y) ? static_cast(-1) : 0); + } +}; + +template class __set_ge_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x >= y) ? 1 : 0); } +}; + +template class __lt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x < y) ? static_cast(-1) : 0); + } +}; + +template class __set_lt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x < y) ? 1 : 0); } +}; + +template class __le_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x <= y) ? static_cast(-1) : 0); + } +}; + +template class __set_le_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x <= y) ? 
1 : 0); } +}; + +template class BinaryOp> +static inline unsigned int __internal_vs_binary_op(unsigned int x, + unsigned int y) { static_assert(std::is_same::value || std::is_same::value, - "__internal_vcmps_op only accept int8_t and int16_t."); + "__internal_vs_binary_op only accept int8_t and int16_t."); static_assert(sizeof(Tp) * N == sizeof(unsigned int), - "__internal_vcmps_op size mismatch"); + "__internal_vs_binary_op size mismatch"); typedef typename std::make_unsigned::type UTp; UTp res_buf[N] = { 0, }; Tp x_tmp, y_tmp; + BinaryOp b_op; for (size_t idx = 0; idx < N; ++idx) { x_tmp = __bit_cast(__get_bytes_by_index(x, idx)); y_tmp = __bit_cast(__get_bytes_by_index(y, idx)); - if (comp(x_tmp, y_tmp)) - res_buf[idx] = static_cast(-1); - else - res_buf[idx] = 0; + res_buf[idx] = b_op(x_tmp, y_tmp); } return __assemble_integral_value(res_buf); } -template -static inline unsigned int __internal_vcmpu_op(unsigned int x, unsigned int y, - Comp comp) { +template class BinaryOp> +static inline unsigned int __internal_vu_binary_op(unsigned int x, + unsigned int y) { static_assert(std::is_same::value || std::is_same::value, - "__internal_vcmpu_op only accept uint8_t and uint16_t."); + "__internal_vu_binary_op only accept uint8_t and uint16_t."); static_assert(sizeof(Tp) * N == sizeof(unsigned int), - "__internal_vcmpu_op size mismatch"); + "__internal_vu_binary_op size mismatch"); Tp res_buf[N] = { 0, }; Tp x_tmp, y_tmp; + BinaryOp b_op; for (size_t idx = 0; idx < N; ++idx) { x_tmp = __get_bytes_by_index(x, idx); y_tmp = __get_bytes_by_index(y, idx); - if (comp(x_tmp, y_tmp)) - res_buf[idx] = static_cast(-1); - else - res_buf[idx] = 0; + res_buf[idx] = b_op(x_tmp, y_tmp); } return __assemble_integral_value(res_buf); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { - return __internal_vcmpu_op>( - x, y, std::equal_to()); + return __internal_vu_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int 
__devicelib_imf_vcmpeq4(unsigned int x, unsigned int y) { - return __internal_vcmpu_op>( - x, y, std::equal_to()); + return __internal_vu_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpges2(unsigned int x, unsigned int y) { - return __internal_vcmps_op>( - x, y, std::greater_equal()); + return __internal_vs_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpges4(unsigned int x, unsigned int y) { - return __internal_vcmps_op>( - x, y, std::greater_equal()); + return __internal_vs_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgeu2(unsigned int x, unsigned int y) { - return __internal_vcmpu_op>( - x, y, std::greater_equal()); + return __internal_vu_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgeu4(unsigned int x, unsigned int y) { - return __internal_vcmpu_op>( - x, y, std::greater_equal()); + return __internal_vu_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgts2(unsigned int x, unsigned int y) { - return __internal_vcmps_op>( - x, y, std::greater()); + return __internal_vs_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgts4(unsigned int x, unsigned int y) { - return __internal_vcmps_op>( - x, y, std::greater()); + return __internal_vs_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu2(unsigned int x, unsigned int y) { - return __internal_vcmpu_op>( - x, y, std::greater()); + return __internal_vu_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu4(unsigned int x, unsigned int y) { - return __internal_vcmpu_op>( - x, y, std::greater()); + return __internal_vu_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmples2(unsigned int x, unsigned int y) { - return __internal_vcmps_op>( - x, y, std::less_equal()); + return __internal_vs_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmples4(unsigned int x, 
unsigned int y) { - return __internal_vcmps_op>( - x, y, std::less_equal()); + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int 
__devicelib_imf_vmins4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int 
__devicelib_imf_vsetgtu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts2(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts4(unsigned int x, unsigned int y) { + return __internal_vs_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu2(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu4(unsigned int x, unsigned int y) { + return __internal_vu_binary_op(x, y); } #endif diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index 778f8eacca170..008a5fa5d1728 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -682,6 +682,12 @@ unsigned int __devicelib_imf_vcmpeq2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpges2(unsigned int, unsigned int); @@ -712,6 +718,108 @@ unsigned int __devicelib_imf_vcmples2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int 
__devicelib_imf_vcmples4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu2(unsigned int, unsigned 
int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __imf_vabs2(unsigned int x) { return __devicelib_imf_vabs2(x); } @@ -847,4 +955,184 @@ DEVICE_EXTERN_C_INLINE unsigned int __imf_vcmples4(unsigned int x, unsigned int y) { return __devicelib_imf_vcmples4(x, y); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpleu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpleu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpleu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpleu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmplts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmplts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmplts4(unsigned int x, unsigned int y) { + return 
__devicelib_imf_vcmplts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpltu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpltu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpltu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpltu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpne2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpne2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpne4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpne4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxs2(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxs2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxs4(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxs4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmins2(unsigned int x, unsigned int y) { + return __devicelib_imf_vmins2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmins4(unsigned int x, unsigned int y) { + return __devicelib_imf_vmins4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vminu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vminu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vminu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vminu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vseteq2(unsigned int x, unsigned int y) { + return __devicelib_imf_vseteq2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vseteq4(unsigned int x, unsigned int y) { + return __devicelib_imf_vseteq4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetne2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetne2(x, y); +} + 
+DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetne4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetne4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetges2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetges2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetges4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetges4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgeu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgeu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgeu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgeu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgtu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgtu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgtu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgtu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetles2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetles2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetles4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetles4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetleu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetleu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetleu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetleu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetlts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetlts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetlts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetlts4(x, y); +} + 
+DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetltu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetltu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetltu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetltu4(x, y); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 2a3b1297c0e10..9f0aae9f36c92 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -272,6 +272,8 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vavgs4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpeq2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpeq4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpne2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpne4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpges2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpges4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpgeu2", DeviceLibExt::cl_intel_devicelib_imf}, @@ -282,6 +284,40 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vcmpgtu4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmples2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmples4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpleu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpleu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmplts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmplts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpltu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpltu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxs4", DeviceLibExt::cl_intel_devicelib_imf}, + 
{"__devicelib_imf_vmaxu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmins2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmins4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vminu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vminu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vseteq2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vseteq4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetne2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetne4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetges2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetges4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgeu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgeu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgtu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgtu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetles2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetles4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetleu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetleu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetlts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetlts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetltu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetltu4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", 
DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index 00475389d0c29..f56435a30dcea 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -1902,6 +1902,8 @@ extern SYCL_EXTERNAL unsigned int __imf_vavgs2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vavgs4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpeq2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpeq4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpne2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpne4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpges2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpges4(unsigned int x, @@ -1922,6 +1924,62 @@ extern SYCL_EXTERNAL unsigned int __imf_vcmples2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmples4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpleu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpleu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmplts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmplts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpltu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpltu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxs2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxs4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmins2(unsigned int x, unsigned 
int y); +extern SYCL_EXTERNAL unsigned int __imf_vmins4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vminu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vminu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vseteq2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vseteq4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetne2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetne4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetges2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetges4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgeu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgeu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgtu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgtu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetles2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetles4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetleu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetleu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetlts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetlts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetltu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetltu4(unsigned int x, + unsigned int y); } #ifdef __GLIBC__ extern "C" { From 
ccbb31dd2bf949057cdad4f97f2138e21e2770cc Mon Sep 17 00:00:00 2001 From: jinge90 Date: Tue, 28 Jun 2022 09:07:56 +0800 Subject: [PATCH 07/14] Remove redundant code for simd unary op Signed-off-by: jinge90 --- libdevice/device_imf.hpp | 2 +- libdevice/imf_utils/simd_emulate.cpp | 218 ++++++++++++--------------- 2 files changed, 99 insertions(+), 121 deletions(-) diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 5bc72b1b8045a..09bd36ceb87ee 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -12,8 +12,8 @@ #include "device.h" #include "imf_half.hpp" #include +#include #include - #ifdef __LIBDEVICE_IMF_ENABLED__ #if !defined(__SPIR__) && !defined(__LIBDEVICE_HOST_IMPL__) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index b318500398700..bf9c92e424956 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -10,64 +10,63 @@ #include "../device_imf.hpp" #ifdef __LIBDEVICE_IMF_ENABLED__ -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabs2(unsigned int x) { - uint16_t res_buf[2] = { +template class __abs_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { return __abs(x); } +}; + +template class __abss_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { + if (x == std::numeric_limits::min()) + return std::numeric_limits::max(); + else + return __abs(x); + } +}; + +template class UnaryOp> +static inline unsigned int __internal_v_unary_op(unsigned int x) { + static_assert(std::is_integral::value && + (sizeof(Tp) == 1 || sizeof(Tp) == 2), + "__internal_v_unary_op accepts 1/2 byte integer type."); + static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_v_unary_op size mismatch"); + typedef typename std::make_unsigned::type UTp; + UTp res_buf[N] = { 0, }; - for (size_t idx = 0; idx < 2; ++idx) { - int16_t tmp = __bit_cast( - __get_bytes_by_index(x, 
idx)); - res_buf[idx] = __bit_cast(__abs(tmp)); + Tp x_tmp; + UnaryOp u_op; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = static_cast(__get_bytes_by_index(x, idx)); + res_buf[idx] = u_op(x_tmp); } - return __assemble_integral_value(res_buf); + return __assemble_integral_value(res_buf); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs2(unsigned int x) { + return __internal_v_unary_op(x); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabs4(unsigned int x) { - uint8_t res_buf[4] = { - 0, - }; - for (size_t idx = 0; idx < 4; ++idx) { - int8_t tmp = - __bit_cast(__get_bytes_by_index(x, idx)); - res_buf[idx] = __bit_cast(__abs(tmp)); - } - return __assemble_integral_value(res_buf); + return __internal_v_unary_op(x); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsss2(unsigned int x) { - uint16_t res_buf[2] = { - 0, - }; - for (size_t idx = 0; idx < 2; ++idx) { - uint16_t tmp = __get_bytes_by_index(x, idx); - if (tmp == 0x8000) - res_buf[idx] = 0x7FFF; - else { - int16_t s_tmp = __bit_cast(tmp); - res_buf[idx] = __bit_cast(__abs(s_tmp)); - } - } - return __assemble_integral_value(res_buf); + return __internal_v_unary_op(x); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsss4(unsigned int x) { - uint8_t res_buf[4] = { - 0, - }; - for (size_t idx = 0; idx < 4; ++idx) { - uint8_t tmp = __get_bytes_by_index(x, idx); - if (tmp == 0x80) - res_buf[idx] = 0x7F; - else { - int8_t s_tmp = __bit_cast(tmp); - res_buf[idx] = __bit_cast(__abs(s_tmp)); - } - } - return __assemble_integral_value(res_buf); + return __internal_v_unary_op(x); } DEVICE_EXTERN_C_INLINE @@ -397,13 +396,13 @@ template class __set_le_op { }; template class BinaryOp> -static inline unsigned int __internal_vs_binary_op(unsigned int x, - unsigned int y) { - static_assert(std::is_same::value || - std::is_same::value, - "__internal_vs_binary_op only accept int8_t and int16_t."); +static inline unsigned int __internal_v_binary_op(unsigned int x, + unsigned int y) { + 
static_assert(std::is_integral::value && + (sizeof(Tp) == 1 || sizeof(Tp) == 2), + "__internal_v_binary_op accepts 1/2 byte integer type."); static_assert(sizeof(Tp) * N == sizeof(unsigned int), - "__internal_vs_binary_op size mismatch"); + "__internal_v_binary_op size mismatch"); typedef typename std::make_unsigned::type UTp; UTp res_buf[N] = { 0, @@ -411,271 +410,250 @@ static inline unsigned int __internal_vs_binary_op(unsigned int x, Tp x_tmp, y_tmp; BinaryOp b_op; for (size_t idx = 0; idx < N; ++idx) { - x_tmp = __bit_cast(__get_bytes_by_index(x, idx)); - y_tmp = __bit_cast(__get_bytes_by_index(y, idx)); + x_tmp = static_cast(__get_bytes_by_index(x, idx)); + y_tmp = static_cast(__get_bytes_by_index(y, idx)); res_buf[idx] = b_op(x_tmp, y_tmp); } return __assemble_integral_value(res_buf); } -template class BinaryOp> -static inline unsigned int __internal_vu_binary_op(unsigned int x, - unsigned int y) { - static_assert(std::is_same::value || - std::is_same::value, - "__internal_vu_binary_op only accept uint8_t and uint16_t."); - static_assert(sizeof(Tp) * N == sizeof(unsigned int), - "__internal_vu_binary_op size mismatch"); - Tp res_buf[N] = { - 0, - }; - Tp x_tmp, y_tmp; - BinaryOp b_op; - for (size_t idx = 0; idx < N; ++idx) { - x_tmp = __get_bytes_by_index(x, idx); - y_tmp = __get_bytes_by_index(y, idx); - res_buf[idx] = b_op(x_tmp, y_tmp); - } - return __assemble_integral_value(res_buf); -} - DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpges2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int 
__devicelib_imf_vcmpges4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgeu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgeu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgts2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgts4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmples2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmples4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpleu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpleu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmplts2(unsigned int x, unsigned int y) { - 
return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmplts4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpltu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpltu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpne2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpne4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxs2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxs4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmins2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmins4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); 
} DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vminu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vminu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vseteq2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vseteq4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetne2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetne4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetges2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetges4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgeu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgeu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgts2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgts4(unsigned 
int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgtu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgtu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetles2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetles4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetleu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetleu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetlts2(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetlts4(unsigned int x, unsigned int y) { - return __internal_vs_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetltu2(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetltu4(unsigned int x, unsigned int y) { - return __internal_vu_binary_op(x, y); + return __internal_v_binary_op(x, y); } #endif From 3c28d6fb04b03f76f2877675c65e57354d4b99aa Mon Sep 17 00:00:00 2001 From: jinge90 Date: Tue, 28 Jun 2022 16:55:01 +0800 
Subject: [PATCH 08/14] Use __internal_v_binary/unary_op to replace all redudant code Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 413 +++++++++++---------------- 1 file changed, 173 insertions(+), 240 deletions(-) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index bf9c92e424956..8cebdba1b497b 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -10,6 +10,17 @@ #include "../device_imf.hpp" #ifdef __LIBDEVICE_IMF_ENABLED__ +template struct __twice_size; +template using __twice_size_t = typename __twice_size::type; +template struct __twice_size_tag { + using type = Tp; +}; + +template <> struct __twice_size : __twice_size_tag {}; +template <> struct __twice_size : __twice_size_tag {}; +template <> struct __twice_size : __twice_size_tag {}; +template <> struct __twice_size : __twice_size_tag {}; + template class __abs_op { typedef typename std::make_unsigned::type UTp; @@ -49,246 +60,6 @@ static inline unsigned int __internal_v_unary_op(unsigned int x) { return __assemble_integral_value(res_buf); } -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabs2(unsigned int x) { - return __internal_v_unary_op(x); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabs4(unsigned int x) { - return __internal_v_unary_op(x); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabsss2(unsigned int x) { - return __internal_v_unary_op(x); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabsss4(unsigned int x) { - return __internal_v_unary_op(x); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabsdiffs2(unsigned int x, unsigned int y) { - uint16_t res_buf[2] = { - 0, - }; - int32_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 2; ++idx) { - x_tmp = static_cast(__bit_cast( - __get_bytes_by_index(x, idx))); - y_tmp = static_cast(__bit_cast( - __get_bytes_by_index(y, idx))); - x_tmp -= y_tmp; - res_buf[idx] = static_cast(__abs(x_tmp)); - } - 
return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabsdiffs4(unsigned int x, unsigned int y) { - uint8_t res_buf[4] = { - 0, - }; - int16_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 4; ++idx) { - x_tmp = static_cast(__bit_cast( - __get_bytes_by_index(x, idx))); - y_tmp = static_cast(__bit_cast( - __get_bytes_by_index(y, idx))); - x_tmp -= y_tmp; - res_buf[idx] = static_cast(__abs(x_tmp)); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabsdiffu2(unsigned int x, unsigned int y) { - uint16_t res_buf[2] = { - 0, - }; - uint16_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 2; ++idx) { - x_tmp = __get_bytes_by_index(x, idx); - y_tmp = __get_bytes_by_index(y, idx); - if (x_tmp < y_tmp) - __swap(x_tmp, y_tmp); - x_tmp -= y_tmp; - res_buf[idx] = x_tmp; - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vabsdiffu4(unsigned int x, unsigned int y) { - uint8_t res_buf[4] = { - 0, - }; - uint8_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 4; ++idx) { - x_tmp = __get_bytes_by_index(x, idx); - y_tmp = __get_bytes_by_index(y, idx); - if (x_tmp < y_tmp) - __swap(x_tmp, y_tmp); - x_tmp -= y_tmp; - res_buf[idx] = x_tmp; - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vadd2(unsigned int x, unsigned int y) { - uint16_t res_buf[2] = { - 0, - }; - - uint32_t tmp; - uint16_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 2; ++idx) { - x_tmp = __get_bytes_by_index(x, idx); - y_tmp = __get_bytes_by_index(y, idx); - tmp = x_tmp + y_tmp; - res_buf[idx] = __get_bytes_by_index(tmp, 0); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vadd4(unsigned int x, unsigned int y) { - uint8_t res_buf[4] = { - 0, - }; - - uint16_t tmp; - uint8_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 4; ++idx) { - x_tmp = 
__get_bytes_by_index(x, idx); - y_tmp = __get_bytes_by_index(y, idx); - tmp = x_tmp + y_tmp; - res_buf[idx] = __get_bytes_by_index(tmp, 0); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vaddss2(unsigned int x, unsigned int y) { - uint16_t res_buf[2] = { - 0, - }; - - int32_t tmp; - int16_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 2; ++idx) { - x_tmp = __bit_cast( - __get_bytes_by_index(x, idx)); - y_tmp = __bit_cast( - __get_bytes_by_index(y, idx)); - tmp = x_tmp + y_tmp; - if (tmp > 32767) - res_buf[idx] = 0x7FFF; - else if (tmp < -32768) - res_buf[idx] = 0x8000; - else - res_buf[idx] = __get_bytes_by_index(tmp, 0); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vaddss4(unsigned int x, unsigned int y) { - uint8_t res_buf[4] = { - 0, - }; - - int16_t tmp; - int8_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 4; ++idx) { - x_tmp = - __bit_cast(__get_bytes_by_index(x, idx)); - y_tmp = - __bit_cast(__get_bytes_by_index(y, idx)); - tmp = x_tmp + y_tmp; - if (tmp > 127) - res_buf[idx] = 0x7F; - else if (tmp < -128) - res_buf[idx] = 0x80; - else - res_buf[idx] = __get_bytes_by_index(tmp, 0); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vaddus2(unsigned int x, unsigned int y) { - uint16_t res_buf[2] = { - 0, - }; - uint32_t tmp; - uint16_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 2; ++idx) { - x_tmp = __get_bytes_by_index(x, idx); - y_tmp = __get_bytes_by_index(y, idx); - tmp = x_tmp + y_tmp; - if (tmp > 65535) - res_buf[idx] = 0xFFFF; - else - res_buf[idx] = static_cast(tmp); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vaddus4(unsigned int x, unsigned int y) { - uint8_t res_buf[4] = { - 0, - }; - uint16_t tmp; - uint8_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 4; ++idx) { - x_tmp = __get_bytes_by_index(x, idx); - y_tmp = 
__get_bytes_by_index(y, idx); - tmp = x_tmp + y_tmp; - if (tmp > 255) - res_buf[idx] = 0xFF; - else - res_buf[idx] = static_cast(tmp); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vavgs2(unsigned int x, unsigned int y) { - uint16_t res_buf[2] = { - 0, - }; - int16_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 2; ++idx) { - x_tmp = __bit_cast( - __get_bytes_by_index(x, idx)); - y_tmp = __bit_cast( - __get_bytes_by_index(y, idx)); - res_buf[idx] = __bit_cast(__shadd(x_tmp, y_tmp)); - } - return __assemble_integral_value(res_buf); -} - -DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { - uint8_t res_buf[4] = { - 0, - }; - int8_t x_tmp, y_tmp; - for (size_t idx = 0; idx < 4; ++idx) { - x_tmp = - __bit_cast(__get_bytes_by_index(x, idx)); - y_tmp = - __bit_cast(__get_bytes_by_index(y, idx)); - res_buf[idx] = __bit_cast(__shadd(x_tmp, y_tmp)); - } - return __assemble_integral_value(res_buf); -} - template class __min_op { public: Tp operator()(const Tp &x, const Tp &y) { return (x < y) ? x : y; } @@ -395,6 +166,88 @@ template class __set_le_op { UTp operator()(const Tp &x, const Tp &y) { return ((x <= y) ? 
1 : 0); } }; +template class __abs_diff_s_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __abs_diff_s_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + __twice_size_t tx = x, ty = y; + tx -= ty; + return static_cast(__abs(tx)); + } +}; + +template class __abs_diff_u_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __abs_diff_u_op"); + +public: + Tp operator()(Tp &x, Tp &y) { + if (x < y) + __swap(x, y); + x -= y; + return x; + } +}; + +template class __add_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return x + y; } +}; + +template class __add_us_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __add_us_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { + __twice_size_t z = x + y; + if (z > std::numeric_limits::max()) + return std::numeric_limits::max(); + else + return static_cast(z); + } +}; + +// Clang will optimize this function with llvm.sadd.sat intrinsic which +// can't be handled by llvm-spirv translator, so using turn off clang +// optimization for this function to avoid llvm-spirv crash. 
+#pragma clang optimize off +template class __add_ss_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __add_ss_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + __twice_size_t z = x + y; + __max_op<__twice_size_t> __max_val; + __min_op<__twice_size_t> __min_val; + return static_cast( + __min_val(__max_val(z, std::numeric_limits::min()), + std::numeric_limits::max())); + } +}; +#pragma clang optimize on + +template class __avgs_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __avgs_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return static_cast(__srhadd(x, y)); + } +}; + template class BinaryOp> static inline unsigned int __internal_v_binary_op(unsigned int x, unsigned int y) { @@ -417,6 +270,86 @@ static inline unsigned int __internal_v_binary_op(unsigned int x, return __assemble_integral_value(res_buf); } +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs2(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs4(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss2(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss4(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE 
+unsigned int __devicelib_imf_vabsdiffu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); From 9fb259890eb430a62fda335b19c382ed35d0c7c9 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Thu, 30 Jun 2022 16:54:57 +0800 Subject: [PATCH 09/14] Add __imf_vsadu2/4 __imf_vsads2/4 Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 86 +++++++++++++++++++ libdevice/imf_wrapper.cpp | 60 +++++++++++++ .../sycl-post-link/SYCLDeviceLibReqMask.cpp | 8 ++ sycl/include/CL/sycl/builtins.hpp | 8 ++ 4 files changed, 162 insertions(+) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 8cebdba1b497b..0919cf937ce13 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ 
b/libdevice/imf_utils/simd_emulate.cpp @@ -40,6 +40,31 @@ template class __abss_op { } }; +template class __neg_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __neg_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { return static_cast(-x); } +}; + +template class __negss_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __negss_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { + UTp tx = static_cast(-x); + if (x == std::numeric_limits::min()) + tx = static_cast(std::numeric_limits::max()); + return tx; + } +}; + template class UnaryOp> static inline unsigned int __internal_v_unary_op(unsigned int x) { static_assert(std::is_integral::value && @@ -290,6 +315,26 @@ unsigned int __devicelib_imf_vabsss4(unsigned int x) { return __internal_v_unary_op(x); } +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg2(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg4(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss2(unsigned int x) { + return __internal_v_unary_op(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss4(unsigned int x) { + return __internal_v_unary_op(x); +} + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsdiffs2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); @@ -589,4 +634,45 @@ DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetltu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } + +template +static inline unsigned int __internal_v_sad_op(unsigned int x, unsigned int y) { + static_assert(std::is_integral::value && + (sizeof(Tp) == 1 || sizeof(Tp) == 2), + "__internal_v_sad_op accepts 1/2 byte integer type."); + 
static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_v_sad_op size mismatch"); + typedef typename std::make_unsigned::type UTp; + unsigned int res = 0; + typedef __twice_size_t __TwiceTp; + Tp x_tmp, y_tmp; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = static_cast(__get_bytes_by_index(x, idx)); + y_tmp = static_cast(__get_bytes_by_index(y, idx)); + if (x_tmp < y_tmp) + __swap(x_tmp, y_tmp); + res += static_cast(static_cast<__TwiceTp>(x_tmp - y_tmp)); + } + return res; +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads2(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads4(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu2(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu4(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} #endif diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index 008a5fa5d1728..db6bc09286b62 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -634,6 +634,18 @@ unsigned int __devicelib_imf_vabs2(unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabs4(unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss4(unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsdiffs2(unsigned int, unsigned int); @@ -820,12 +832,40 @@ unsigned int __devicelib_imf_vsetltu2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetltu4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int 
__devicelib_imf_vsads2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __imf_vabs2(unsigned int x) { return __devicelib_imf_vabs2(x); } DEVICE_EXTERN_C_INLINE unsigned int __imf_vabs4(unsigned int x) { return __devicelib_imf_vabs4(x); } +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vneg2(unsigned int x) { return __devicelib_imf_vneg2(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vneg4(unsigned int x) { return __devicelib_imf_vneg4(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vnegss2(unsigned int x) { + return __devicelib_imf_vnegss2(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vnegss4(unsigned int x) { + return __devicelib_imf_vnegss4(x); +} + DEVICE_EXTERN_C_INLINE unsigned int __imf_vabsdiffs2(unsigned int x, unsigned int y) { return __devicelib_imf_vabsdiffs2(x, y); @@ -1135,4 +1175,24 @@ DEVICE_EXTERN_C_INLINE unsigned int __imf_vsetltu4(unsigned int x, unsigned int y) { return __devicelib_imf_vsetltu4(x, y); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsads2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsads2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsads4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsads4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsadu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsadu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsadu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsadu4(x, y); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 9f0aae9f36c92..ba6221ff5fe55 100644 --- 
a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -258,6 +258,10 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vabs4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vabsss2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vabsss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vneg2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vneg4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vnegss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vnegss4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vabsdiffs2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vabsdiffs4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vabsdiffu2", DeviceLibExt::cl_intel_devicelib_imf}, @@ -318,6 +322,10 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vsetlts4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vsetltu2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vsetltu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsads2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsads4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsadu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsadu4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index f56435a30dcea..3d941c00aa0f2 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -1884,6 +1884,10 @@ extern SYCL_EXTERNAL unsigned int __imf_vabs2(unsigned int x); extern SYCL_EXTERNAL unsigned int __imf_vabs4(unsigned int x); extern SYCL_EXTERNAL unsigned int __imf_vabsss2(unsigned int x); 
extern SYCL_EXTERNAL unsigned int __imf_vabsss4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vneg2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vneg4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vnegss2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vnegss4(unsigned int x); extern SYCL_EXTERNAL unsigned int __imf_vabsdiffs2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vabsdiffs4(unsigned int x, @@ -1980,6 +1984,10 @@ extern SYCL_EXTERNAL unsigned int __imf_vsetltu2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vsetltu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsads2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsads4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsadu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsadu4(unsigned int x, unsigned int y); } #ifdef __GLIBC__ extern "C" { From a4e60b88f821d0b005da480c06e383ae7ba58220 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Fri, 1 Jul 2022 14:40:39 +0800 Subject: [PATCH 10/14] Add all SIMD APIs Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 113 ++++++++++++++++++ libdevice/imf_wrapper.cpp | 80 +++++++++++++ .../sycl-post-link/SYCLDeviceLibReqMask.cpp | 10 ++ sycl/include/CL/sycl/builtins.hpp | 10 ++ 4 files changed, 213 insertions(+) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 0919cf937ce13..450fcbc9e1468 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -261,6 +261,47 @@ template class __add_ss_op { }; #pragma clang optimize on +template class __sub_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return x - y; } +}; + +template class __sub_us_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __add_us_op"); + 
+public: + Tp operator()(const Tp &x, const Tp &y) { + if (x < y) + return 0; + else + return x - y; + } +}; + +// Clang will optimize this function with llvm.sadd.sat intrinsic which +// can't be handled by llvm-spirv translator, so using turn off clang +// optimization for this function to avoid llvm-spirv crash. +#pragma clang optimize off +template class __sub_ss_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __add_ss_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + __twice_size_t z = x - y; + __max_op<__twice_size_t> __max_val; + __min_op<__twice_size_t> __min_val; + return static_cast( + __min_val(__max_val(z, std::numeric_limits::min()), + std::numeric_limits::max())); + } +}; +#pragma clang optimize on + template class __avgs_op { static_assert(std::is_same::value || std::is_same::value, @@ -273,6 +314,28 @@ template class __avgs_op { } }; +template class __avgu_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __avgu_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { + return __urhadd(x, y); + } +}; + +template class __uhadd_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __uhadd_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { + return __uhadd(x, y);; + } +}; + template class BinaryOp> static inline unsigned int __internal_v_binary_op(unsigned int x, unsigned int y) { @@ -385,6 +448,36 @@ unsigned int __devicelib_imf_vaddus4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgs2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); @@ -395,6 +488,26 @@ unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_haddu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_haddu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index db6bc09286b62..d24a0315831bf 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -682,12 +682,42 @@ unsigned int __devicelib_imf_vaddus2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vaddus4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int 
__devicelib_imf_vsub4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vhaddu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vhaddu4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgs2(unsigned int, unsigned int); DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgs4(unsigned int, unsigned int); +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu4(unsigned int, unsigned int); + DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq2(unsigned int, unsigned int); @@ -926,6 +956,46 @@ unsigned int __imf_vaddus4(unsigned int x, unsigned int y) { return __devicelib_imf_vaddus4(x, y); } +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsub2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsub2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsub4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsub4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubss2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubss2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubss4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubss4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubus2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubus2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubus4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubus4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int 
__imf_vhaddu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vhaddu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vhaddu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vhaddu4(x, y); +} + DEVICE_EXTERN_C_INLINE unsigned int __imf_vavgs2(unsigned int x, unsigned int y) { return __devicelib_imf_vavgs2(x, y); @@ -936,6 +1006,16 @@ unsigned int __imf_vavgs4(unsigned int x, unsigned int y) { return __devicelib_imf_vavgs4(x, y); } +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgu4(x, y); +} + DEVICE_EXTERN_C_INLINE unsigned int __imf_vcmpeq2(unsigned int x, unsigned int y) { return __devicelib_imf_vcmpeq2(x, y); diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index ba6221ff5fe55..9ce9c56ac46f7 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -272,8 +272,18 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vaddss4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vaddus2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vaddus4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsub2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsub4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubus2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubus4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vavgs2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vavgs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgu2", DeviceLibExt::cl_intel_devicelib_imf}, + 
{"__devicelib_imf_vavgu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vhaddu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vhaddu4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpeq2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpeq4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vcmpne2", DeviceLibExt::cl_intel_devicelib_imf}, diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index 3d941c00aa0f2..cceaeb064eb71 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -1902,8 +1902,18 @@ extern SYCL_EXTERNAL unsigned int __imf_vaddss2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vaddss4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vaddus2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vaddus4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsub2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsub4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubss2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubss4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubus2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubus4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vavgs2(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vavgs4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vhaddu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vhaddu4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpeq2(unsigned int x, 
unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpeq4(unsigned int x, unsigned int y); extern SYCL_EXTERNAL unsigned int __imf_vcmpne2(unsigned int x, unsigned int y); From 5309c30c921cb08bc6a331048b20bfb4c8e03a1b Mon Sep 17 00:00:00 2001 From: jinge90 Date: Fri, 1 Jul 2022 15:14:42 +0800 Subject: [PATCH 11/14] Fix clang format issue Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 450fcbc9e1468..3179a19112eee 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -320,9 +320,7 @@ template class __avgu_op { "Tp can only accept uint8_t, uint16_t for __avgu_op"); public: - Tp operator()(const Tp &x, const Tp &y) { - return __urhadd(x, y); - } + Tp operator()(const Tp &x, const Tp &y) { return __urhadd(x, y); } }; template class __uhadd_op { @@ -331,9 +329,7 @@ template class __uhadd_op { "Tp can only accept uint8_t, uint16_t for __uhadd_op"); public: - Tp operator()(const Tp &x, const Tp &y) { - return __uhadd(x, y);; - } + Tp operator()(const Tp &x, const Tp &y) { return __uhadd(x, y); } }; template class BinaryOp> From 595ce8c381722acac5e721a22a3e034a541ec14a Mon Sep 17 00:00:00 2001 From: jinge90 Date: Sat, 2 Jul 2022 10:53:26 +0800 Subject: [PATCH 12/14] fix incorrect __devicelib_imf_vhaddu2 name Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 3179a19112eee..1c82fd09c175b 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -495,12 +495,12 @@ unsigned int __devicelib_imf_vavgu4(unsigned int x, unsigned int y) { } DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_haddu2(unsigned int x, unsigned int y) { +unsigned int 
__devicelib_imf_vhaddu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE -unsigned int __devicelib_imf_haddu4(unsigned int x, unsigned int y) { +unsigned int __devicelib_imf_vhaddu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } From e3116916e28293e3586fd8491e837fb512b62431 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Tue, 5 Jul 2022 14:01:54 +0800 Subject: [PATCH 13/14] Fix bug in __imf_vcmplt4 Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 1c82fd09c175b..9357265d46938 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -581,7 +581,7 @@ unsigned int __devicelib_imf_vcmplts2(unsigned int x, unsigned int y) { DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmplts4(unsigned int x, unsigned int y) { - return __internal_v_binary_op(x, y); + return __internal_v_binary_op(x, y); } DEVICE_EXTERN_C_INLINE From 9b117ee280579ad0967622b3f3c3f7cd02d6d8bb Mon Sep 17 00:00:00 2001 From: jinge90 Date: Thu, 7 Jul 2022 00:11:05 +0800 Subject: [PATCH 14/14] Add comments to describle imf simd emulate APIs. Signed-off-by: jinge90 --- libdevice/imf_utils/simd_emulate.cpp | 238 +++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp index 9357265d46938..22ff9eccb4a57 100644 --- a/libdevice/imf_utils/simd_emulate.cpp +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -354,391 +354,621 @@ static inline unsigned int __internal_v_binary_op(unsigned int x, return __assemble_integral_value(res_buf); } +// Split 32-bit into 2 parts, each consisting of 16 bits, compute absolute +// value for each part and assemble the results into 32-bit unsigned int. 
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabs2(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 4 parts, each consisting of 8 bits, compute absolute +// value for each part and assemble the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabs4(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 2 parts, each consisting of 16 bits, compute absolute +// value with signed saturation for each part and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsss2(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 4 parts, each consisting of 8 bits, compute absolute +// value with signed saturation for each part and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsss4(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 2 parts, each consisting of 16 bits, compute negative +// value for each part and assemble the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vneg2(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 4 parts, each consisting of 8 bits, compute negative +// value for each part and assemble the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vneg4(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 2 parts, each consisting of 16 bits, compute negative +// value with signed saturation for each part and assemble the results into +// 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vnegss2(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 4 parts, each consisting of 8 bits, compute negative +// value with signed saturation for each part and assemble the results into +// 32-bit unsigned int. 
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vnegss4(unsigned int x) { return __internal_v_unary_op(x); } +// Split 32-bit into 2 parts, each part is sigend 16-bit int, compute absolute +// difference for corresponding parts and assemble the results into 32-bit +// unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsdiffs2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is sigend 8-bit int, compute absolute +// difference for corresponding parts and assemble the results into 32-bit +// unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsdiffs4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is unsigend 16-bit int, compute +// absolute difference for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsdiffu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is unsigend 8-bit int, compute +// absolute difference for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vabsdiffu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// unsigned addition for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vadd2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// unsigned addition for corresponding parts and assemble the results +// into 32-bit unsigned int. 
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vadd4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// addition with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vaddss2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// addition with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vaddss4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// addition with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vaddus2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// addition with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vaddus4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// subtraction with wrap-round for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsub2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// subtraction with wrap-round for corresponding parts and assemble +// the results into 32-bit unsigned int. 
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsub4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// subtraction with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsubss2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// subtraction with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsubss4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// subtraction with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsubus2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// subtraction with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsubus4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// signed rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgs2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// signed rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. 
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// unsigned rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// unsigned rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vavgu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// unsigned average for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vhaddu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// unsigned average for corresponding parts and assemble the results +// into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vhaddu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part consisting of 16 bits. Compare +// corresponding parts, return 0xFFFF if they are equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part consisting of 8 bits. Compare +// corresponding parts, return 0xFF if they are equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. 
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpeq4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 0xFFFF if x >= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpges2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x >= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpges4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x >= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgeu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 0xFF if x >= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgeu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 0xFFFF if x > y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgts2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x > y, otherwise return 0. 
Assemble +// partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgts4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x > y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 0xFF if x > y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpgtu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 0xFFFF if x <= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmples2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x <= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmples4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x <= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpleu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. 
For +// corresponding parts from x and y, return 0xFF if x <= y, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpleu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 signed 16-bit parts. For corresponding +// parts from x and y, return 0xFFFF if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmplts2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 signed 8-bit parts. For corresponding +// parts from x and y, return 0xFF if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmplts4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 unsigned 16-bit parts. For corresponding +// parts from x and y, return 0xFFFF if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpltu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 unsigned 8-bit parts. For corresponding +// parts from x and y, return 0xFF if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpltu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. Compare +// corresponding parts, return 0xFFFF if they are not equal, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int.
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpne2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. Compare +// corresponding parts, return 0xFF if they are not equal, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vcmpne4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. For +// corresponding parts, compute the signed maximum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxs2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. For +// corresponding parts, compute the signed maximum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxs4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. For +// corresponding parts, compute the unsigned maximum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. For +// corresponding parts, compute the unsigned maximum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmaxu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. For +// corresponding parts, compute the signed minimum value and assemble the +// partial results into a 32-bit unsigned int.
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmins2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. For +// corresponding parts, compute the signed minimum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vmins4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. For +// corresponding parts, compute the unsigned minimum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vminu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. For +// corresponding parts, compute the unsigned minimum value and assemble the +// partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vminu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. Compare +// corresponding parts, return 1 if they are equal, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vseteq2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. Compare +// corresponding parts, return 1 if they are equal, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vseteq4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 parts, each consisting of 16 bits. Compare +// corresponding parts, return 1 if they are not equal, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int.
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetne2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 parts, each consisting of 8 bits. Compare +// corresponding parts, return 1 if they are not equal, otherwise return 0. +// Assemble the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetne4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 signed 16-bit parts. For corresponding +// parts from x and y, return 1 if x >= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetges2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 signed 8-bit parts. For corresponding +// parts from x and y, return 1 if x >= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetges4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 unsigned 16-bit parts. For corresponding +// parts from x and y, return 1 if x >= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgeu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 unsigned 8-bit parts. For corresponding +// parts from x and y, return 1 if x >= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgeu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 signed 16-bit parts. For corresponding +// parts from x and y, return 1 if x > y, otherwise return 0.
Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgts2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 signed 8-bit parts. For corresponding +// parts from x and y, return 1 if x > y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgts4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 unsigned 16-bit parts. For corresponding +// parts from x and y, return 1 if x > y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgtu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 unsigned 8-bit parts. For corresponding +// parts from x and y, return 1 if x > y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetgtu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 signed 16-bit parts. For corresponding +// parts from x and y, return 1 if x <= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetles2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 signed 8-bit parts. For corresponding +// parts from x and y, return 1 if x <= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetles4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 unsigned 16-bit parts. For corresponding +// parts from x and y, return 1 if x <= y, otherwise return 0.
+// Assemble the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetleu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 unsigned 8-bit parts. For corresponding +// parts from x and y, return 1 if x <= y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetleu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 signed 16-bit parts. For corresponding +// parts from x and y, return 1 if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetlts2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 signed 8-bit parts. For corresponding +// parts from x and y, return 1 if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetlts4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 2 unsigned 16-bit parts. For corresponding +// parts from x and y, return 1 if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetltu2(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); } +// Split each 32-bit input into 4 unsigned 8-bit parts. For corresponding +// parts from x and y, return 1 if x < y, otherwise return 0. Assemble +// the partial results into a 32-bit unsigned int.
DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsetltu4(unsigned int x, unsigned int y) { return __internal_v_binary_op(x, y); @@ -765,21 +995,29 @@ static inline unsigned int __internal_v_sad_op(unsigned int x, unsigned int y) { return res; } +// Split each 32-bit input into 2 signed 16-bit parts. For corresponding +// parts, compute the absolute differences and sum them up. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsads2(unsigned int x, unsigned int y) { return __internal_v_sad_op(x, y); } +// Split each 32-bit input into 4 signed 8-bit parts. For corresponding +// parts, compute the absolute differences and sum them up. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsads4(unsigned int x, unsigned int y) { return __internal_v_sad_op(x, y); } +// Split each 32-bit input into 2 unsigned 16-bit parts. For corresponding +// parts, compute the absolute differences and sum them up. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsadu2(unsigned int x, unsigned int y) { return __internal_v_sad_op(x, y); } +// Split each 32-bit input into 4 unsigned 8-bit parts. For corresponding +// parts, compute the absolute differences and sum them up. DEVICE_EXTERN_C_INLINE unsigned int __devicelib_imf_vsadu4(unsigned int x, unsigned int y) { return __internal_v_sad_op(x, y);