Skip to content

[SYCL][ESIMD][NFC] Fix namespace of ESIMD implementation details. #3487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 17 additions & 22 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_host_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@

#define SIMDCF_ELEMENT_SKIP(i)

namespace cl {
__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {

namespace detail {
namespace half_impl {
class half;
} // namespace half_impl
} // namespace detail
} // namespace sycl
} // namespace cl

using half = cl::sycl::detail::half_impl::half;

namespace EsimdEmulSys {
namespace INTEL {
namespace gpu {
namespace emu {
namespace detail {

constexpr int sat_is_on = 1;

Expand All @@ -44,14 +44,10 @@ template <typename RT> struct satur {
return (RT)val;
}

#ifdef max
#undef max
#endif
#ifdef min
#undef min
#endif
const RT t_max = std::numeric_limits<RT>::max();
const RT t_min = std::numeric_limits<RT>::min();
// min/max can be macros on Windows, so wrap them into parens to avoid their
// expansion
const RT t_max = (std::numeric_limits<RT>::max)();
const RT t_min = (std::numeric_limits<RT>::min)();

if (val > t_max) {
return t_max;
Expand Down Expand Up @@ -112,8 +108,6 @@ template <> struct SetSatur<double, true> {
static unsigned int set() { return sat_is_on; }
};

} // namespace EsimdEmulSys

// used for intermediate type in dp4a emulation
template <typename T1, typename T2> struct restype_ex {
private:
Expand Down Expand Up @@ -470,10 +464,11 @@ template <typename T> struct dwordtype;
template <> struct dwordtype<int> { static const bool value = true; };
template <> struct dwordtype<unsigned int> { static const bool value = true; };

template <unsigned int N1, unsigned int N2> struct ressize {
static const unsigned int size = (N1 > N2) ? N1 : N2;
static const bool conformable =
check_true < N1 % size == 0 && N2 % size == 0 > ::value;
};
} // namespace detail
} // namespace emu
} // namespace gpu
} // namespace INTEL
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

#endif
#endif // #ifndef __SYCL_DEVICE_ONLY__
102 changes: 44 additions & 58 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
#include <CL/sycl/INTEL/esimd/detail/esimd_types.hpp>
#include <CL/sycl/INTEL/esimd/detail/esimd_util.hpp>
#include <CL/sycl/INTEL/esimd/esimd_enum.hpp>
#include <CL/sycl/detail/accessor_impl.hpp>

#include <assert.h>
#include <cstdint>

#define __SIGD sycl::INTEL::gpu::detail
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little concerned about the name here. Maybe something like this would be better?

namespace sigd = sycl::INTEL::gpu::detail;

Similar to what we have for csd namespace.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the problem with such short names in headers in the global namespace is that they can conflict with other code. {{__}} factors out user code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's true. Do you know if we have project guidelines for such cases?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only guideline I know is the llvm coding style.

Copy link
Contributor Author

@kbobrovs kbobrovs Apr 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason why I did not use namespace __SIGD = sycl::INTEL::gpu::detail; instead is because it is in the global scope and can't be undef'ed, unlike a macro. Which means all subsequent headers and user code will see that declaration. With macro, I just #undef it once no longer needed.


// \brief __esimd_rdregion: region access intrinsic.
//
// @param T the element data type, one of i8, i16, i32, i64, half, float,
Expand Down Expand Up @@ -63,13 +64,13 @@
//
template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth = 0>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset);
SYCL_EXTERNAL __SIGD::vector_type_t<T, M>
__esimd_rdregion(__SIGD::vector_type_t<T, N> Input, uint16_t Offset);

template <typename T, int N, int M, int ParentWidth = 0>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset);
SYCL_EXTERNAL __SIGD::vector_type_t<T, M>
__esimd_rdindirect(__SIGD::vector_type_t<T, N> Input,
__SIGD::vector_type_t<uint16_t, M> Offset);

// __esimd_wrregion returns the updated vector with the region updated.
//
Expand Down Expand Up @@ -120,46 +121,28 @@ __esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
//
template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth = 0>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
SYCL_EXTERNAL __SIGD::vector_type_t<T, N>
__esimd_wrregion(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal, uint16_t Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);

template <typename T, int N, int M, int ParentWidth = 0>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
SYCL_EXTERNAL __SIGD::vector_type_t<T, N>
__esimd_wrindirect(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal,
__SIGD::vector_type_t<uint16_t, M> Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace INTEL {
namespace gpu {
// TODO dependencies on the std SYCL concepts like images
// should be refactored in a separate header
class AccessorPrivateProxy {
public:
#ifdef __SYCL_DEVICE_ONLY__
template <typename AccessorTy>
static auto getNativeImageObj(const AccessorTy &Acc) {
return Acc.getNativeImageObj();
}
#else
template <typename AccessorTy>
static auto getImageRange(const AccessorTy &Acc) {
return Acc.getAccessRange();
}
static auto getElemSize(const sycl::detail::AccessorBaseHost &Acc) {
return Acc.getElemSize();
}
#endif
};
namespace detail {

/// read from a basic region of a vector, return a vector
template <typename BT, int BN, typename RTy>
vector_type_t<typename RTy::element_type, RTy::length>
ESIMD_INLINE readRegion(const vector_type_t<BT, BN> &Base, RTy Region) {
__SIGD::vector_type_t<typename RTy::element_type, RTy::length> ESIMD_INLINE
readRegion(const __SIGD::vector_type_t<BT, BN> &Base, RTy Region) {
using ElemTy = typename RTy::element_type;
auto Base1 = bitcast<ElemTy, BT, BN>(Base);
constexpr int Bytes = BN * sizeof(BT);
Expand All @@ -180,8 +163,8 @@ vector_type_t<typename RTy::element_type, RTy::length>

/// read from a nested region of a vector, return a vector
template <typename BT, int BN, typename T, typename U>
ESIMD_INLINE vector_type_t<typename T::element_type, T::length>
readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
ESIMD_INLINE __SIGD::vector_type_t<typename T::element_type, T::length>
readRegion(const __SIGD::vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
// parent-region type
using PaTy = typename shape_type<U>::type;
constexpr int BN1 = PaTy::length;
Expand Down Expand Up @@ -222,6 +205,7 @@ readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
}
}

} // namespace detail
} // namespace gpu
} // namespace INTEL
} // namespace sycl
Expand All @@ -233,37 +217,37 @@ readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
// optimization on simd object
//
template <typename T, int N>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_vload(const sycl::INTEL::gpu::vector_type_t<T, N> *ptr);
SYCL_EXTERNAL __SIGD::vector_type_t<T, N>
__esimd_vload(const __SIGD::vector_type_t<T, N> *ptr);

// vstore
//
// map to the backend vstore intrinsic, used by compiler to control
// optimization on simd object
template <typename T, int N>
SYCL_EXTERNAL void __esimd_vstore(sycl::INTEL::gpu::vector_type_t<T, N> *ptr,
sycl::INTEL::gpu::vector_type_t<T, N> vals);
SYCL_EXTERNAL void __esimd_vstore(__SIGD::vector_type_t<T, N> *ptr,
__SIGD::vector_type_t<T, N> vals);

template <typename T, int N>
SYCL_EXTERNAL uint16_t __esimd_any(sycl::INTEL::gpu::vector_type_t<T, N> src);
SYCL_EXTERNAL uint16_t __esimd_any(__SIGD::vector_type_t<T, N> src);

template <typename T, int N>
SYCL_EXTERNAL uint16_t __esimd_all(sycl::INTEL::gpu::vector_type_t<T, N> src);
SYCL_EXTERNAL uint16_t __esimd_all(__SIGD::vector_type_t<T, N> src);

#ifndef __SYCL_DEVICE_ONLY__

// Implementations of ESIMD intrinsics for the SYCL host device
template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
SYCL_EXTERNAL __SIGD::vector_type_t<T, M>
__esimd_rdregion(__SIGD::vector_type_t<T, N> Input, uint16_t Offset) {
uint16_t EltOffset = Offset / sizeof(T);
assert(Offset % sizeof(T) == 0);

int NumRows = M / Width;
assert(M % Width == 0);

sycl::INTEL::gpu::vector_type_t<T, M> Result;
__SIGD::vector_type_t<T, M> Result;
int Index = 0;
for (int i = 0; i < NumRows; ++i) {
for (int j = 0; j < Width; ++j) {
Expand All @@ -274,10 +258,10 @@ __esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
}

template <typename T, int N, int M, int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset) {
sycl::INTEL::gpu::vector_type_t<T, M> Result;
SYCL_EXTERNAL __SIGD::vector_type_t<T, M>
__esimd_rdindirect(__SIGD::vector_type_t<T, N> Input,
__SIGD::vector_type_t<uint16_t, M> Offset) {
__SIGD::vector_type_t<T, M> Result;
for (int i = 0; i < M; ++i) {
uint16_t EltOffset = Offset[i] / sizeof(T);
assert(Offset[i] % sizeof(T) == 0);
Expand All @@ -289,17 +273,17 @@ __esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,

template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
SYCL_EXTERNAL __SIGD::vector_type_t<T, N>
__esimd_wrregion(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal, uint16_t Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask) {
uint16_t EltOffset = Offset / sizeof(T);
assert(Offset % sizeof(T) == 0);

int NumRows = M / Width;
assert(M % Width == 0);

sycl::INTEL::gpu::vector_type_t<T, N> Result = OldVal;
__SIGD::vector_type_t<T, N> Result = OldVal;
int Index = 0;
for (int i = 0; i < NumRows; ++i) {
for (int j = 0; j < Width; ++j) {
Expand All @@ -312,12 +296,12 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
}

template <typename T, int N, int M, int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
SYCL_EXTERNAL __SIGD::vector_type_t<T, N>
__esimd_wrindirect(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal,
__SIGD::vector_type_t<uint16_t, M> Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask) {
sycl::INTEL::gpu::vector_type_t<T, N> Result = OldVal;
__SIGD::vector_type_t<T, N> Result = OldVal;
for (int i = 0; i < M; ++i) {
if (Mask[i]) {
uint16_t EltOffset = Offset[i] / sizeof(T);
Expand All @@ -330,3 +314,5 @@ __esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
}

#endif // __SYCL_DEVICE_ONLY__

#undef __SIGD
Loading