Skip to content

Commit 57ce9e6

Browse files
committed
[OpenMP] Replace most GPU helpers with ones from <gpuintrin.h>
Summary: This patch cleans up the runtime by using the definitions from `<gpuintrin.h>` instead of hand-written, target-specific helpers. This reduces complexity and makes the runtime easier to port. A handful of helpers are left over: atomicInc, shuffle, and the sleep calls. These are not easily replaced, but I will work on them.
1 parent 6a2bf1d commit 57ce9e6

File tree

9 files changed

+68
-272
lines changed

9 files changed

+68
-272
lines changed

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#ifndef OMPTARGET_TYPES_H
1313
#define OMPTARGET_TYPES_H
1414

15+
#include <gpuintrin.h>
1516
#include <stddef.h>
1617
#include <stdint.h>
1718

@@ -155,19 +156,6 @@ typedef enum omp_allocator_handle_t {
155156
#define __PRAGMA(STR) _Pragma(#STR)
156157
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
157158

158-
#define SHARED(NAME) \
159-
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
160-
161-
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
162-
// now that's not the case.
163-
#define THREAD_LOCAL(NAME) \
164-
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
165-
166-
// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
167-
// does?
168-
#define CONSTANT(NAME) \
169-
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
170-
171159
///}
172160

173161
#endif

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ using namespace ompx;
2828
// This variable should be visible to the plugin so we override the default
2929
// hidden visibility.
3030
[[gnu::used, gnu::retain, gnu::weak,
31-
gnu::visibility("protected")]] DeviceEnvironmentTy
32-
CONSTANT(__omp_rtl_device_environment);
31+
gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
32+
__omp_rtl_device_environment;
3333

3434
uint32_t config::getAssumeTeamsOversubscription() {
3535
return __omp_rtl_assume_teams_oversubscription;

offload/DeviceRTL/src/DeviceUtils.cpp

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "Interface.h"
1616
#include "Mapping.h"
1717

18+
#include <gpuintrin.h>
19+
1820
using namespace ompx;
1921

2022
namespace impl {
@@ -29,17 +31,12 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
2931
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
3032
}
3133

32-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
33-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
34-
int32_t Width);
35-
36-
uint64_t ballotSync(uint64_t Mask, int32_t Pred);
37-
3834
/// AMDGCN Implementation
3935
///
4036
///{
4137
#ifdef __AMDGPU__
4238

39+
// TODO: Move this to <gpuintrin.h>.
4340
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
4441
int Self = mapping::getThreadIdInWarp();
4542
int Index = SrcLane + (Self & ~(Width - 1));
@@ -53,15 +50,6 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
5350
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
5451
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
5552
}
56-
57-
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
58-
return Mask & __builtin_amdgcn_ballot_w64(Pred);
59-
}
60-
61-
bool isSharedMemPtr(const void *Ptr) {
62-
return __builtin_amdgcn_is_shared(
63-
(const __attribute__((address_space(0))) void *)Ptr);
64-
}
6553
#endif
6654
///}
6755

@@ -70,6 +58,7 @@ bool isSharedMemPtr(const void *Ptr) {
7058
///{
7159
#ifdef __NVPTX__
7260

61+
// TODO: Move this to <gpuintrin.h>.
7362
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
7463
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
7564
}
@@ -78,13 +67,6 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
7867
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
7968
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
8069
}
81-
82-
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
83-
return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
84-
}
85-
86-
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
87-
8870
#endif
8971
///}
9072
} // namespace impl
@@ -117,10 +99,10 @@ int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
11799
}
118100

119101
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
120-
return impl::ballotSync(Mask, Pred);
102+
return __gpu_ballot(Mask, Pred);
121103
}
122104

123-
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
105+
bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
124106

125107
extern "C" {
126108
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {

0 commit comments

Comments
 (0)