Skip to content

[OpenMP] Replace most GPU helpers with ones from <gpuintrin.h> #125771

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions offload/DeviceRTL/include/DeviceTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#ifndef OMPTARGET_TYPES_H
#define OMPTARGET_TYPES_H

#include <gpuintrin.h>
#include <stddef.h>
#include <stdint.h>

Expand Down Expand Up @@ -155,19 +156,6 @@ typedef enum omp_allocator_handle_t {
// Stringify STR and pass the resulting string to the _Pragma operator.
#define __PRAGMA(STR) _Pragma(#STR)
// Expand to __PRAGMA with `omp` placed in front of STR, so the pragma text
// becomes "omp STR".
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)

#define SHARED(NAME) \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This macro marked things uninitialised

[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];

// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
// now that's not the case.
// Expands to NAME wrapped in the clang address_space(5) and
// loader_uninitialized attributes; the use site supplies the type in front and
// the terminating semicolon.
#define THREAD_LOCAL(NAME) \
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]

// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
// does?
// Expands to NAME wrapped in the clang address_space(4) and
// loader_uninitialized attributes; the use site supplies the type in front and
// the terminating semicolon.
#define CONSTANT(NAME) \
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]

///}

#endif
4 changes: 2 additions & 2 deletions offload/DeviceRTL/src/Configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ using namespace ompx;
// This variable should be visible to the plugin so we override the default
// hidden visibility.
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceEnvironmentTy
CONSTANT(__omp_rtl_device_environment);
gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
__omp_rtl_device_environment;

uint32_t config::getAssumeTeamsOversubscription() {
return __omp_rtl_assume_teams_oversubscription;
Expand Down
100 changes: 16 additions & 84 deletions offload/DeviceRTL/src/DeviceUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,116 +15,48 @@
#include "Interface.h"
#include "Mapping.h"

using namespace ompx;

namespace impl {

/// Split the 64-bit value \p Val into its two 32-bit halves.
///
/// \param Val      Value to split.
/// \param LowBits  Receives bits [31:0] of \p Val.
/// \param HighBits Receives bits [63:32] of \p Val.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  // Truncating and shifting the fixed-width operand needs no 64-bit literal,
  // so nothing here depends on how wide `unsigned long` is.
  *LowBits = static_cast<uint32_t>(Val);
  *HighBits = static_cast<uint32_t>(Val >> 32);
}

/// Combine two 32-bit halves into one 64-bit value: \p HighBits lands in bits
/// [63:32] and \p LowBits in bits [31:0].
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  const uint64_t Upper = static_cast<uint64_t>(HighBits) << 32;
  const uint64_t Lower = static_cast<uint64_t>(LowBits);
  return Upper | Lower;
}

int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);

uint64_t ballotSync(uint64_t Mask, int32_t Pred);

/// AMDGCN Implementation
///
///{
#ifdef __AMDGPU__

/// AMDGPU indexed shuffle built on __builtin_amdgcn_ds_bpermute.
///
/// The lane index handed to the builtin is \p SrcLane plus the calling lane's
/// id with the bits selected by (Width - 1) cleared. \p Mask is accepted but
/// not used here.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
  const int Lane = mapping::getThreadIdInWarp();
  const int SegmentStart = Lane & ~(Width - 1);
  const int Source = SrcLane + SegmentStart;
  // The builtin is given the lane index shifted left by two.
  return __builtin_amdgcn_ds_bpermute(Source << 2, Var);
}

/// AMDGPU shuffle-down built on __builtin_amdgcn_ds_bpermute.
///
/// Reads from lane (self + \p LaneDelta) unless the position inside the
/// \p Width-sized segment plus \p LaneDelta reaches \p Width, in which case the
/// calling lane reads from itself. \p Mask is accepted but not used here.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                    int32_t Width) {
  const int Lane = mapping::getThreadIdInWarp();
  const int PosInSegment = Lane & (Width - 1);
  const bool LeavesSegment =
      static_cast<int>(LaneDelta + PosInSegment) >= Width;

  int Source = Lane;
  if (!LeavesSegment)
    Source = Lane + LaneDelta;

  // The builtin is given the lane index shifted left by two.
  return __builtin_amdgcn_ds_bpermute(Source << 2, Var);
}
#include <gpuintrin.h>

/// AMDGPU ballot: the result of __builtin_amdgcn_ballot_w64 for \p Pred,
/// restricted to the bits set in \p Mask.
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
  const uint64_t Ballot = __builtin_amdgcn_ballot_w64(Pred);
  return Mask & Ballot;
}

/// AMDGPU: returns what __builtin_amdgcn_is_shared reports for \p Ptr.
bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
// The pointer is cast to an explicit address_space(0) pointer type before it
// is handed to the builtin.
(const __attribute__((address_space(0))) void *)Ptr);
}
#endif
///}

/// NVPTX Implementation
///
///{
#ifdef __NVPTX__

/// NVPTX indexed shuffle: forwards to __nvvm_shfl_sync_idx_i32, passing
/// (Width - 1) as the intrinsic's last operand.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
  const int32_t WidthMinusOne = Width - 1;
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WidthMinusOne);
}

/// NVPTX shuffle-down: forwards to __nvvm_shfl_sync_down_i32.
///
/// The intrinsic's last operand is built from (warp size - Width) shifted left
/// by eight, OR-ed with 0x1f.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
  const auto UpperBits = (mapping::getWarpSize() - Width) << 8;
  const int32_t Control = UpperBits | 0x1f;
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, Control);
}

/// NVPTX ballot: forwards to __nvvm_vote_ballot_sync with \p Mask narrowed to
/// 32 bits.
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
  const auto Mask32 = static_cast<uint32_t>(Mask);
  return __nvvm_vote_ballot_sync(Mask32, Pred);
}

/// NVPTX: returns what __nvvm_isspacep_shared reports for \p Ptr.
bool isSharedMemPtr(const void *Ptr) {
  const bool InShared = __nvvm_isspacep_shared(Ptr);
  return InShared;
}

#endif
///}
} // namespace impl
using namespace ompx;

/// Combine two 32-bit halves into one 64-bit value.
///
/// \param LowBits  Becomes bits [31:0] of the result.
/// \param HighBits Becomes bits [63:32] of the result.
/// \returns The packed 64-bit value.
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
  // Single return: the body previously had two consecutive returns computing
  // the same value, which left the second one unreachable.
  return (uint64_t(HighBits) << 32) | uint64_t(LowBits);
}

void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
impl::Unpack(Val, &LowBits, &HighBits);
static_assert(sizeof(unsigned long) == 8, "");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it really matter for literal values?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
static_assert(sizeof(unsigned long) == 8, "");
static_assert(sizeof(unsigned long) == 8, "size mismatch");

LowBits = static_cast<uint32_t>(Val & 0x00000000fffffffful);
HighBits = static_cast<uint32_t>((Val & 0xffffffff00000000ul) >> 32);
}

/// Indexed shuffle, forwarded to the <gpuintrin.h> helper.
///
/// \param Mask    Lane mask handed through to the intrinsic.
/// \param Var     Value this lane contributes.
/// \param SrcLane Index of the lane to read from.
/// \param Width   Segment width handed through to the intrinsic.
/// \returns Whatever __gpu_shuffle_idx_u32 yields for these operands.
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
                       int32_t Width) {
  // The intrinsic takes the lane index before the value, i.e.
  // (mask, index, value, width) -- the same order the shuffleDown overloads in
  // this file pass their operands in.
  return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
}

/// 32-bit shuffle-down.
///
/// NOTE(review): two bodies appear back to back here. As written, the first
/// `return` ends the function and the three statements after it are
/// unreachable. Presumably the first statement is the body being replaced and
/// the rest is its replacement -- confirm against the real change.
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width) {
return impl::shuffleDown(Mask, Var, Delta, Width);
// Index is the calling lane itself when Delta plus the lane's position inside
// the Width-sized segment reaches Width; otherwise it is Self + Delta.
int32_t Self = mapping::getThreadIdInWarp();
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
}

/// 64-bit shuffle-down.
///
/// NOTE(review): two bodies appear back to back here. As written, the function
/// returns at `return utils::pack(Lo, Hi);` and the three statements after it
/// are unreachable. Presumably the first five statements are the body being
/// replaced and the rest is its replacement -- confirm against the real change.
int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
int32_t Width) {
// Split Var into halves, shuffle each half separately, then recombine.
uint32_t Lo, Hi;
utils::unpack(Var, Lo, Hi);
Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
return utils::pack(Lo, Hi);
// Index is the calling lane itself when Delta plus the lane's position inside
// the Width-sized segment reaches Width; otherwise it is Self + Delta.
int32_t Self = mapping::getThreadIdInWarp();
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
}

/// Ballot over the lanes selected by \p Mask.
///
/// NOTE(review): two `return` statements appear back to back; as written the
/// second one is unreachable. Presumably the first is the statement being
/// replaced and the second its replacement -- confirm against the real change.
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
return impl::ballotSync(Mask, Pred);
return __gpu_ballot(Mask, Pred);
}

// NOTE(review): the two lines below define the same function twice, once via
// impl::isSharedMemPtr and once via __gpu_is_ptr_local. Presumably the first
// is the definition being replaced and the second its replacement -- confirm
// against the real change; only one of them can remain in a compilable file.
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }

extern "C" {
/// C entry point for a 32-bit shuffle-down across all lanes (lanes::All).
///
/// The parameter named \p SrcLane is forwarded in the position that
/// shuffleDown names `Width`.
///
/// NOTE(review): two `return` statements appear back to back; as written the
/// second one is unreachable. Presumably the first is the statement being
/// replaced and the second its replacement -- confirm against the real change.
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
}

int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
Expand Down
Loading