diff --git a/libdevice/cmake/modules/ImfSrcConcate.cmake b/libdevice/cmake/modules/ImfSrcConcate.cmake index 4ea04f9d6f72a..59e40736289f6 100644 --- a/libdevice/cmake/modules/ImfSrcConcate.cmake +++ b/libdevice/cmake/modules/ImfSrcConcate.cmake @@ -1,6 +1,7 @@ set(imf_fp32_fallback_src_list imf_utils/integer_misc.cpp imf_utils/half_convert.cpp imf_utils/float_convert.cpp + imf_utils/simd_emulate.cpp imf/imf_inline_fp32.cpp) set(imf_fp64_fallback_src_list imf_utils/double_convert.cpp diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index 5cce135eb9429..9c5e9133fef64 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -123,6 +123,7 @@ set(imf_fallback_fp32_deps device.h device_imf.hpp imf_half.hpp imf_utils/integer_misc.cpp imf_utils/float_convert.cpp imf_utils/half_convert.cpp + imf_utils/simd_emulate.cpp imf/imf_inline_fp32.cpp) set(imf_fallback_fp64_deps device.h device_imf.hpp imf_half.hpp imf_utils/double_convert.cpp diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp index 9466f2ce3970a..09bd36ceb87ee 100644 --- a/libdevice/device_imf.hpp +++ b/libdevice/device_imf.hpp @@ -12,8 +12,8 @@ #include "device.h" #include "imf_half.hpp" #include +#include #include - #ifdef __LIBDEVICE_IMF_ENABLED__ #if !defined(__SPIR__) && !defined(__LIBDEVICE_HOST_IMPL__) @@ -458,12 +458,21 @@ static inline int __popcll(unsigned long long int x) { #endif } -static inline unsigned int __abs(int x) { return x < 0 ? -x : x; } - -static inline unsigned long long int __abs(long long int x) { +template +static inline typename std::make_unsigned::type __abs(T x) { + static_assert((std::is_signed::value && std::is_integral::value), + "__abs can only accept signed integral type."); return x < 0 ? 
-x : x; } +template static inline void __swap(T &x, T &y) { + static_assert(std::is_integral::value, + "__swap can only accept integral type."); + T tmp = x; + x = y; + y = tmp; +} + template static inline Ty2 __get_bytes_by_index(Ty1 x, size_t idx) { static_assert(!std::is_signed::value && !std::is_signed::value, diff --git a/libdevice/imf_utils/simd_emulate.cpp b/libdevice/imf_utils/simd_emulate.cpp new file mode 100644 index 0000000000000..22ff9eccb4a57 --- /dev/null +++ b/libdevice/imf_utils/simd_emulate.cpp @@ -0,0 +1,1025 @@ +//==------ simd_emulate.cpp - serial implementation to emulate simd functions +// ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../device_imf.hpp" +#ifdef __LIBDEVICE_IMF_ENABLED__ + +template struct __twice_size; +template using __twice_size_t = typename __twice_size::type; +template struct __twice_size_tag { + using type = Tp; +}; + +template <> struct __twice_size : __twice_size_tag {}; +template <> struct __twice_size : __twice_size_tag {}; +template <> struct __twice_size : __twice_size_tag {}; +template <> struct __twice_size : __twice_size_tag {}; + +template class __abs_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { return __abs(x); } +}; + +template class __abss_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { + if (x == std::numeric_limits::min()) + return std::numeric_limits::max(); + else + return __abs(x); + } +}; + +template class __neg_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __neg_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { return 
static_cast(-x); } +}; + +template class __negss_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __negss_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x) { + UTp tx = static_cast(-x); + if (x == std::numeric_limits::min()) + tx = static_cast(std::numeric_limits::max()); + return tx; + } +}; + +template class UnaryOp> +static inline unsigned int __internal_v_unary_op(unsigned int x) { + static_assert(std::is_integral::value && + (sizeof(Tp) == 1 || sizeof(Tp) == 2), + "__internal_v_unary_op accepts 1/2 byte integer type."); + static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_v_unary_op size mismatch"); + typedef typename std::make_unsigned::type UTp; + UTp res_buf[N] = { + 0, + }; + Tp x_tmp; + UnaryOp u_op; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = static_cast(__get_bytes_by_index(x, idx)); + res_buf[idx] = u_op(x_tmp); + } + return __assemble_integral_value(res_buf); +} + +template class __min_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return (x < y) ? x : y; } +}; + +template class __max_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return (x > y) ? x : y; } +}; + +template class __eq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x == y) ? static_cast(-1) : 0); + } +}; + +template class __neq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x != y) ? static_cast(-1) : 0); + } +}; + +template class __set_eq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x == y) ? 1 : 0); } +}; + +template class __set_neq_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x != y) ? 
1 : 0); } +}; + +template class __gt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x > y) ? static_cast(-1) : 0); + } +}; + +template class __set_gt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x > y) ? 1 : 0); } +}; + +template class __ge_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x >= y) ? static_cast(-1) : 0); + } +}; + +template class __set_ge_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x >= y) ? 1 : 0); } +}; + +template class __lt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x < y) ? static_cast(-1) : 0); + } +}; + +template class __set_lt_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x < y) ? 1 : 0); } +}; + +template class __le_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return ((x <= y) ? static_cast(-1) : 0); + } +}; + +template class __set_le_op { + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { return ((x <= y) ? 
1 : 0); } +}; + +template class __abs_diff_s_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __abs_diff_s_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + __twice_size_t tx = x, ty = y; + tx -= ty; + return static_cast(__abs(tx)); + } +}; + +template class __abs_diff_u_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __abs_diff_u_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { + if (x < y) + return y - x; + else + return x - y; + } +}; + +template class __add_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return x + y; } +}; + +template class __add_us_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __add_us_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { + __twice_size_t z = x + y; + if (z > std::numeric_limits::max()) + return std::numeric_limits::max(); + else + return static_cast(z); + } +}; + +// Clang optimizes this function into the llvm.sadd.sat intrinsic, which +// the llvm-spirv translator cannot handle, so clang optimization is +// turned off for this function to avoid an llvm-spirv crash. 
+#pragma clang optimize off +template class __add_ss_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __add_ss_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + __twice_size_t z = x + y; + __max_op<__twice_size_t> __max_val; + __min_op<__twice_size_t> __min_val; + return static_cast( + __min_val(__max_val(z, std::numeric_limits::min()), + std::numeric_limits::max())); + } +}; +#pragma clang optimize on + +template class __sub_op { +public: + Tp operator()(const Tp &x, const Tp &y) { return x - y; } +}; + +template class __sub_us_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __sub_us_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { + if (x < y) + return 0; + else + return x - y; + } +}; + +// Clang optimizes this function into the llvm.ssub.sat intrinsic, which +// the llvm-spirv translator cannot handle, so clang optimization is +// turned off for this function to avoid an llvm-spirv crash. 
+#pragma clang optimize off +template class __sub_ss_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __sub_ss_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + __twice_size_t z = x - y; + __max_op<__twice_size_t> __max_val; + __min_op<__twice_size_t> __min_val; + return static_cast( + __min_val(__max_val(z, std::numeric_limits::min()), + std::numeric_limits::max())); + } +}; +#pragma clang optimize on + +template class __avgs_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept int8_t, int16_t for __avgs_op"); + typedef typename std::make_unsigned::type UTp; + +public: + UTp operator()(const Tp &x, const Tp &y) { + return static_cast(__srhadd(x, y)); + } +}; + +template class __avgu_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __avgu_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { return __urhadd(x, y); } +}; + +template class __uhadd_op { + static_assert(std::is_same::value || + std::is_same::value, + "Tp can only accept uint8_t, uint16_t for __uhadd_op"); + +public: + Tp operator()(const Tp &x, const Tp &y) { return __uhadd(x, y); } +}; + +template class BinaryOp> +static inline unsigned int __internal_v_binary_op(unsigned int x, + unsigned int y) { + static_assert(std::is_integral::value && + (sizeof(Tp) == 1 || sizeof(Tp) == 2), + "__internal_v_binary_op accepts 1/2 byte integer type."); + static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_v_binary_op size mismatch"); + typedef typename std::make_unsigned::type UTp; + UTp res_buf[N] = { + 0, + }; + Tp x_tmp, y_tmp; + BinaryOp b_op; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = static_cast(__get_bytes_by_index(x, idx)); + y_tmp = static_cast(__get_bytes_by_index(y, idx)); + res_buf[idx] = b_op(x_tmp, y_tmp); + } + return __assemble_integral_value(res_buf); 
+} + +// Split 32-bit into 2 parts, each consisting of 16 bits, compute absolute +// value for each part and assemble the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs2(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 4 parts, each consisting of 8 bits, compute absolute +// value for each part and assemble the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs4(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 2 parts, each consisting of 16 bits, compute absolute +// value with signed saturation for each part and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss2(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 4 parts, each consisting of 8 bits, compute absolute +// value with signed saturation for each part and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss4(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 2 parts, each consisting of 16 bits, compute negative +// value for each part and assemble the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg2(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 4 parts, each consisting of 8 bits, compute negative +// value for each part and assemble the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg4(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 2 parts, each consisting of 16 bits, compute negative +// value with signed saturation for each part and assemble the results into +// 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss2(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 4 parts, each consisting of 8 bits, compute negative +// value with signed saturation for each part and assemble the results into +// 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss4(unsigned int x) { + return __internal_v_unary_op(x); +} + +// Split 32-bit into 2 parts, each part is signed 16-bit int, compute absolute +// difference for corresponding parts and assemble the results into 32-bit +// unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is signed 8-bit int, compute absolute +// difference for corresponding parts and assemble the results into 32-bit +// unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is unsigned 16-bit int, compute +// absolute difference for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is unsigned 8-bit int, compute +// absolute difference for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// unsigned addition for corresponding parts and assemble the results +// into 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// unsigned addition for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// addition with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// addition with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// addition with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// addition with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// subtraction with wrap-around for corresponding parts and assemble +// the results into 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// subtraction with wrap-around for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// subtraction with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// subtraction with signed saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// subtraction with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// subtraction with unsigned saturation for corresponding parts and assemble +// the results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// signed rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// signed rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// unsigned rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// unsigned rounded average for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits, compute +// unsigned average for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vhaddu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits, compute +// unsigned average for corresponding parts and assemble the results +// into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vhaddu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. Compare +// corresponding parts, return 0xFFFF if they are equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. Compare +// corresponding parts, return 0xFF if they are equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 0xFFFF if x >= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x >= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x >= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 0xFF if x >= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. 
For corresponding +// part from x and y, return 0xFFFF if x > y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x > y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x > y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 0xFF if x > y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 0xFFFF if x <= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x <= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x <= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 0xFF if x <= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 0xFFFF if x < y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 0xFF if x < y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 0xFFFF if x < y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. 
For +// corresponding part from x and y, return 0xFF if x < y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. Compare +// corresponding parts, return 0xFFFF if they are not equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. Compare +// corresponding parts, return 0xFF if they are not equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. +// For corresponding parts, compute signed maximum value and assemble partial +// results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. For corresponding +// parts, compute signed maximum value and assemble partial results into 32-bit +// unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. +// For corresponding parts, compute unsigned maximum value and assemble partial +// results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. 
For corresponding +// parts, compute unsigned maximum value and assemble partial results into +// 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. +// For corresponding parts, compute signed minimum value and assemble partial +// results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. For corresponding +// parts, compute signed minimum value and assemble partial results into 32-bit +// unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. +// For corresponding parts, compute unsigned minimum value and assemble partial +// results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. For corresponding +// parts, compute unsigned minimum value and assemble partial results into +// 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. Compare +// corresponding parts, return 1 if they are equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. 
Compare +// corresponding parts, return 1 if they are equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part consisting of 16 bits. Compare +// corresponding parts, return 1 if they are not equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part consisting of 8 bits. Compare +// corresponding parts, return 1 if they are not equal, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 1 if x >= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 1 if x >= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 1 if x >= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 1 if x >= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 1 if x > y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 1 if x > y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 1 if x > y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 1 if x > y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 1 if x <= y, otherwise return 0. 
Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 1 if x <= y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. For +// corresponding part from x and y, return 1 if x <= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 1 if x <= y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// part from x and y, return 1 if x < y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// part from x and y, return 1 if x < y, otherwise return 0. Assemble +// partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. 
For +// corresponding part from x and y, return 1 if x < y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu2(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding part from x and y, return 1 if x < y, otherwise return 0. +// Assemble partial results into 32-bit unsigned int. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu4(unsigned int x, unsigned int y) { + return __internal_v_binary_op(x, y); +} + +template +static inline unsigned int __internal_v_sad_op(unsigned int x, unsigned int y) { + static_assert(std::is_integral::value && + (sizeof(Tp) == 1 || sizeof(Tp) == 2), + "__internal_v_sad_op accepts 1/2 byte integer type."); + static_assert(sizeof(Tp) * N == sizeof(unsigned int), + "__internal_v_sad_op size mismatch"); + typedef typename std::make_unsigned::type UTp; + unsigned int res = 0; + typedef __twice_size_t __TwiceTp; + Tp x_tmp, y_tmp; + for (size_t idx = 0; idx < N; ++idx) { + x_tmp = static_cast(__get_bytes_by_index(x, idx)); + y_tmp = static_cast(__get_bytes_by_index(y, idx)); + if (x_tmp < y_tmp) + __swap(x_tmp, y_tmp); + res += static_cast(static_cast<__TwiceTp>(x_tmp - y_tmp)); + } + return res; +} + +// Split 32-bit into 2 parts, each part is 16-bit signed int. For corresponding +// parts, compute absolute difference and sum them up. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads2(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit signed int. For corresponding +// parts, compute absolute difference and sum them up. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads4(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} + +// Split 32-bit into 2 parts, each part is 16-bit unsigned int. 
For +// corresponding parts, compute absolute difference and sum them up. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu2(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} + +// Split 32-bit into 4 parts, each part is 8-bit unsigned int. For +// corresponding parts, compute absolute difference and sum them up. +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu4(unsigned int x, unsigned int y) { + return __internal_v_sad_op(x, y); +} +#endif diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index 6197845aad440..d24a0315831bf 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -627,4 +627,652 @@ _iml_half_internal __imf_copysignf16(_iml_half_internal x, _iml_half_internal y) { return __devicelib_imf_copysignf16(x, y); } + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabs4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vneg4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vnegss4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffs4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsdiffu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss2(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vabsss4(unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vadd4(unsigned int, unsigned 
int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddss4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vaddus4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsub4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubss4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsubus4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vhaddu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vhaddu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgs4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vavgu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpeq4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpne4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpges2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int 
__devicelib_imf_vcmpges4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgeu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpgtu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmples4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpleu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmplts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vcmpltu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxs4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmaxu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vmins4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu2(unsigned int, 
unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vminu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vseteq4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetne4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetges4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgeu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetgtu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetles4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetleu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetlts4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsetltu4(unsigned int, unsigned int); + 
+DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsads4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu2(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_vsadu4(unsigned int, unsigned int); + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabs2(unsigned int x) { return __devicelib_imf_vabs2(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabs4(unsigned int x) { return __devicelib_imf_vabs4(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vneg2(unsigned int x) { return __devicelib_imf_vneg2(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vneg4(unsigned int x) { return __devicelib_imf_vneg4(x); } + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vnegss2(unsigned int x) { + return __devicelib_imf_vnegss2(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vnegss4(unsigned int x) { + return __devicelib_imf_vnegss4(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffs2(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffs2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffs4(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffs4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsdiffu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vabsdiffu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsss2(unsigned int x) { + return __devicelib_imf_vabsss2(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vabsss4(unsigned int x) { + return __devicelib_imf_vabsss4(x); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vadd2(unsigned int x, unsigned int y) { + return __devicelib_imf_vadd2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vadd4(unsigned int x, unsigned int y) { + 
return __devicelib_imf_vadd4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddss2(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddss2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddss4(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddss4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddus2(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddus2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vaddus4(unsigned int x, unsigned int y) { + return __devicelib_imf_vaddus4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsub2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsub2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsub4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsub4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubss2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubss2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubss4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubss4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubus2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubus2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsubus4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsubus4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vhaddu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vhaddu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vhaddu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vhaddu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgs2(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgs2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgs4(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgs4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgu2(x, y); +} + 
+DEVICE_EXTERN_C_INLINE +unsigned int __imf_vavgu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vavgu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpeq2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpeq2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpeq4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpeq4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpges2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpges2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpges4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpges4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgeu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgeu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgeu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgeu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgtu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgtu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpgtu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpgtu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmples2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmples2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmples4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmples4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpleu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpleu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpleu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpleu4(x, y); +} + 
+DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmplts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmplts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmplts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmplts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpltu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpltu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpltu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpltu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpne2(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpne2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vcmpne4(unsigned int x, unsigned int y) { + return __devicelib_imf_vcmpne4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxs2(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxs2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxs4(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxs4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmaxu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vmaxu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmins2(unsigned int x, unsigned int y) { + return __devicelib_imf_vmins2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vmins4(unsigned int x, unsigned int y) { + return __devicelib_imf_vmins4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vminu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vminu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vminu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vminu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vseteq2(unsigned int x, unsigned int y) { + return __devicelib_imf_vseteq2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int 
__imf_vseteq4(unsigned int x, unsigned int y) { + return __devicelib_imf_vseteq4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetne2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetne2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetne4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetne4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetges2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetges2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetges4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetges4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgeu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgeu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgeu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgeu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgtu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgtu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetgtu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetgtu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetles2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetles2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetles4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetles4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetleu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetleu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetleu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetleu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int 
__imf_vsetlts2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetlts2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetlts4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetlts4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetltu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetltu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsetltu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsetltu4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsads2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsads2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsads4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsads4(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsadu2(unsigned int x, unsigned int y) { + return __devicelib_imf_vsadu2(x, y); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_vsadu4(unsigned int x, unsigned int y) { + return __devicelib_imf_vsadu4(x, y); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index ca20506d14160..9ce9c56ac46f7 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -254,6 +254,88 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_fmaxf16", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fminf16", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_copysignf16", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vneg2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vneg4", DeviceLibExt::cl_intel_devicelib_imf}, + 
{"__devicelib_imf_vnegss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vnegss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vabsdiffu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vadd2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vadd4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddus2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vaddus4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsub2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsub4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubss2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubss4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubus2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsubus4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vavgu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vhaddu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vhaddu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpeq2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpeq4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpne2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpne4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpges2", DeviceLibExt::cl_intel_devicelib_imf}, 
+ {"__devicelib_imf_vcmpges4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgeu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgeu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgtu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpgtu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmples2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmples4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpleu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpleu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmplts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmplts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpltu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vcmpltu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxs2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxs4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmaxu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmins2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vmins4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vminu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vminu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vseteq2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vseteq4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetne2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetne4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetges2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetges4", 
DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgeu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgeu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgtu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetgtu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetles2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetles4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetleu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetleu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetlts2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetlts4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetltu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsetltu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsads2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsads4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsadu2", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_vsadu4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index e2d259cde58cf..a52247215cee1 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -1879,6 +1879,125 @@ extern SYCL_EXTERNAL double __imf_uint2double_rn(unsigned int x); extern SYCL_EXTERNAL double __imf_uint2double_ru(unsigned int x); extern SYCL_EXTERNAL double __imf_uint2double_rz(unsigned int x); extern SYCL_EXTERNAL double __imf_hiloint2double(int hi, int lo); + +extern 
SYCL_EXTERNAL unsigned int __imf_vabs2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabs4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabsss2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabsss4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vneg2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vneg4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vnegss2(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vnegss4(unsigned int x); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffs2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffs4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vabsdiffu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vadd2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vadd4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddss2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddss4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddus2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vaddus4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsub2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsub4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubss2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubss4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubus2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsubus4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgs2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgs4(unsigned int x, unsigned int y); 
+extern SYCL_EXTERNAL unsigned int __imf_vavgu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vavgu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vhaddu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vhaddu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpeq2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpeq4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpne2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpne4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpges2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpges4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgeu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgeu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgtu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpgtu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmples2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmples4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpleu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpleu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmplts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmplts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpltu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vcmpltu4(unsigned int x, + 
unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxs2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxs4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmaxu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmins2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vmins4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vminu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vminu4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vseteq2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vseteq4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetne2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetne4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetges2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetges4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgeu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgeu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgtu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetgtu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetles2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetles4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetleu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetleu4(unsigned int x, + 
unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetlts2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetlts4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetltu2(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsetltu4(unsigned int x, + unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsads2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsads4(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsadu2(unsigned int x, unsigned int y); +extern SYCL_EXTERNAL unsigned int __imf_vsadu4(unsigned int x, unsigned int y); } #ifdef __GLIBC__ extern "C" {