Skip to content

Commit 6094f1d

Browse files
committed
[clang][cuda] support __managed__ variables
Closes #147373
1 parent 17c7c2e commit 6094f1d

File tree

10 files changed

+292
-155
lines changed

10 files changed

+292
-155
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1562,7 +1562,7 @@ def CUDAGridConstant : InheritableAttr {
15621562
def HIPManaged : InheritableAttr {
15631563
let Spellings = [GNU<"managed">, Declspec<"__managed__">];
15641564
let Subjects = SubjectList<[Var]>;
1565-
let LangOpts = [HIP];
1565+
let LangOpts = [HIP, CUDA];
15661566
let Documentation = [HIPManagedAttrDocs];
15671567
}
15681568

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class CGNVCUDARuntime : public CGCUDARuntime {
4242
StringRef Prefix;
4343

4444
private:
45-
llvm::IntegerType *IntTy, *SizeTy;
45+
llvm::IntegerType *IntTy, *SizeTy, *CharTy;
4646
llvm::Type *VoidTy;
4747
llvm::PointerType *PtrTy;
4848

@@ -231,6 +231,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
231231
SizeTy = CGM.SizeTy;
232232
VoidTy = CGM.VoidTy;
233233
PtrTy = CGM.UnqualPtrTy;
234+
CharTy = CGM.CharTy;
234235

235236
if (CGM.getLangOpts().OffloadViaLLVM)
236237
Prefix = "llvm";
@@ -547,10 +548,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
547548
}
548549

549550
// Replace the original variable Var with the address loaded from variable
550-
// ManagedVar populated by HIP runtime.
551+
// ManagedVar populated by HIP/CUDA runtime.
551552
static void replaceManagedVar(llvm::GlobalVariable *Var,
552553
llvm::GlobalVariable *ManagedVar) {
553554
SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
555+
554556
for (auto &&VarUse : Var->uses()) {
555557
WorkList.push_back({VarUse.getUser()});
556558
}
@@ -661,8 +663,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
661663
addUnderscoredPrefixToName("RegisterVar"));
662664
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
663665
// size_t, unsigned)
664-
llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
665-
PtrTy, VarSizeTy, IntTy};
666+
// void __cudaRegisterManagedVar(void **, void **, char *, const char *,
667+
// int, size_t, int, int)
668+
SmallVector<llvm::Type *, 8> RegisterManagedVarParams;
669+
if (CGM.getLangOpts().HIP)
670+
RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, VarSizeTy, IntTy};
671+
else
672+
RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy,
673+
IntTy, VarSizeTy, IntTy, IntTy};
674+
666675
llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
667676
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
668677
addUnderscoredPrefixToName("RegisterManagedVar"));
@@ -693,13 +702,23 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
693702
"HIP managed variables not transformed");
694703
auto *ManagedVar = CGM.getModule().getNamedGlobal(
695704
Var->getName().drop_back(StringRef(".managed").size()));
696-
llvm::Value *Args[] = {
697-
&GpuBinaryHandlePtr,
698-
ManagedVar,
699-
Var,
700-
VarName,
701-
llvm::ConstantInt::get(VarSizeTy, VarSize),
702-
llvm::ConstantInt::get(IntTy, Var->getAlignment())};
705+
SmallVector<llvm::Value *, 8> Args;
706+
if (CGM.getLangOpts().HIP)
707+
Args = {&GpuBinaryHandlePtr,
708+
ManagedVar,
709+
Var,
710+
VarName,
711+
llvm::ConstantInt::get(VarSizeTy, VarSize),
712+
llvm::ConstantInt::get(IntTy, Var->getAlignment())};
713+
else
714+
Args = {&GpuBinaryHandlePtr,
715+
ManagedVar,
716+
VarName,
717+
VarName,
718+
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
719+
llvm::ConstantInt::get(VarSizeTy, VarSize),
720+
llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
721+
llvm::ConstantInt::get(IntTy, 0)};
703722
if (!Var->isDeclaration())
704723
Builder.CreateCall(RegisterManagedVar, Args);
705724
} else {
@@ -965,6 +984,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
965984
"__cudaRegisterFatBinaryEnd");
966985
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
967986
}
987+
// Call __cudaInitModule(GpuBinaryHandle) for managed variables
988+
for (auto &&Info : DeviceVars) {
989+
llvm::GlobalVariable *Var = Info.Var;
990+
if (!Var->isDeclaration() && Info.Flags.isManaged()) {
991+
llvm::FunctionCallee NvInitManagedRtWithModule =
992+
CGM.CreateRuntimeFunction(
993+
llvm::FunctionType::get(CharTy, PtrTy, false),
994+
"__cudaInitModule");
995+
CtorBuilder.CreateCall(NvInitManagedRtWithModule, GpuBinaryHandle);
996+
break;
997+
}
998+
}
968999
} else {
9691000
// Generate a unique module ID.
9701001
SmallString<64> ModuleID;
@@ -1158,6 +1189,9 @@ void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
11581189
// transformed managed variable. The transformed managed variable contains
11591190
// the address of managed memory which will be allocated by the runtime.
11601191
void CGNVCUDARuntime::transformManagedVars() {
1192+
// CUDA managed variables directly access in device code
1193+
if (!CGM.getLangOpts().HIP && CGM.getLangOpts().CUDAIsDevice)
1194+
return;
11611195
for (auto &&Info : DeviceVars) {
11621196
llvm::GlobalVariable *Var = Info.Var;
11631197
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,19 +241,26 @@ RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
241241

242242
void NVPTXTargetCodeGenInfo::setTargetAttributes(
243243
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
244-
if (GV->isDeclaration())
245-
return;
244+
246245
const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
247246
if (VD) {
248247
if (M.getLangOpts().CUDA) {
249-
if (VD->getType()->isCUDADeviceBuiltinSurfaceType())
248+
if (!GV->isDeclaration() &&
249+
VD->getType()->isCUDADeviceBuiltinSurfaceType())
250250
addNVVMMetadata(GV, "surface", 1);
251-
else if (VD->getType()->isCUDADeviceBuiltinTextureType())
251+
else if (!GV->isDeclaration() &&
252+
VD->getType()->isCUDADeviceBuiltinTextureType())
252253
addNVVMMetadata(GV, "texture", 1);
254+
// nvlink asserts managed attribute match in decl and def
255+
else if (VD->hasAttr<HIPManagedAttr>())
256+
addNVVMMetadata(GV, "managed", 1);
253257
return;
254258
}
255259
}
256260

261+
if (GV->isDeclaration())
262+
return;
263+
257264
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
258265
if (!FD)
259266
return;

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88
#define __global__ __attribute__((global))
99
#define __host__ __attribute__((host))
1010
#define __shared__ __attribute__((shared))
11-
#if __HIP__
1211
#define __managed__ __attribute__((managed))
13-
#endif
1412
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
1513
#define __grid_constant__ __attribute__((grid_constant))
1614
#else

clang/test/CodeGenCUDA/anon-ns.cu

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,26 +34,26 @@
3434
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
3535
// CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
3636
// CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
37+
// CUDA-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
3738
// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant
3839
// CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
3940

4041
// COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global
4142
// COMNEG-NOT: @{{.*}} = {{.*}} c"_ZN12_GLOBAL__N_12vdE{{.*}}\00"
4243

43-
// HIP-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
44-
// CUDA-DAG: @llvm.compiler.used = {{.*}}@[[VT]]{{.*}}@[[VC]]
44+
// COMMON-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
4545

4646
// COMMON-DAG: @[[KERNSTR:.*]] = {{.*}} c"[[KERN]]\00"
4747
// COMMON-DAG: @[[KTXSTR:.*]] = {{.*}} c"[[KTX]]\00"
4848
// COMMON-DAG: @[[KTLSTR:.*]] = {{.*}} c"[[KTL]]\00"
49-
// HIP-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
49+
// COMMON-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
5050
// COMMON-DAG: @[[VCSTR:.*]] = {{.*}} c"[[VC]]\00"
5151
// COMMON-DAG: @[[VTSTR:.*]] = {{.*}} c"[[VT]]\00"
5252

5353
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KERNSTR]]
5454
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTXSTR]]
5555
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTLSTR]]
56-
// HIP-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
56+
// COMMON-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
5757
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VCSTR]]
5858
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VTSTR]]
5959

@@ -67,9 +67,7 @@ namespace {
6767
struct X {};
6868
X x;
6969
auto lambda = [](){};
70-
#if __HIP__
7170
__managed__ int vm = 1;
72-
#endif
7371
__constant__ int vc = 2;
7472

7573
// C should not be externalized since it is used by device code only.
@@ -89,9 +87,7 @@ void test() {
8987

9088
// A, B, and tempVar<X> should be externalized since they are
9189
// used by host code.
92-
#if __HIP__
9390
getSymbol(&vm);
94-
#endif
9591
getSymbol(&vc);
9692
getSymbol(&vt<X>);
9793
}
Lines changed: 40 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,29 @@
11
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
22
// RUN: -emit-llvm -o - -x hip %s \
3-
// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
3+
// RUN: | FileCheck -check-prefixes=DEV,HIP-D,NORDC,HIP-NORDC %s
44
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
55
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
6-
// RUN: | FileCheck -check-prefixes=DEV,RDC %s
6+
// RUN: | FileCheck -check-prefixes=DEV,HIP-D %s
77
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
88
// RUN: -emit-llvm -o - -x hip %s \
9-
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
9+
// RUN: | FileCheck -check-prefixes=HOST,HIP-H,NORDC-H %s
1010
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
1111
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
12-
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
12+
// RUN: | FileCheck -check-prefixes=HOST,HIP-H,RDC-H %s
13+
1314
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
14-
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
15-
// RUN: | FileCheck -check-prefixes=CUDA %s
15+
// RUN: -emit-llvm -o - -x cuda %s \
16+
// RUN: | FileCheck -check-prefixes=DEV,CUDA-D,NORDC,CUDA-NORDC %s
17+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
18+
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
19+
// RUN: | FileCheck -check-prefixes=DEV,CUDA-D %s
20+
21+
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
22+
// RUN: -emit-llvm -o - -x cuda %s \
23+
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
24+
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
25+
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
26+
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
1627

1728
#include "Inputs/cuda.h"
1829

@@ -24,58 +35,53 @@ __device__ int v1;
2435
// NORDC-H-DAG: @v2 = internal global i32 undef
2536
// RDC-H-DAG: @v2 = global i32 undef
2637
__constant__ int v2;
27-
// DEV-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
38+
// HIP-D-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
39+
// CUDA-D-DAG: @v3 = addrspace(1) externally_initialized global i32 0, align 4
2840
// NORDC-H-DAG: @v3 = internal externally_initialized global ptr null
2941
// RDC-H-DAG: @v3 = externally_initialized global ptr null
30-
#if __HIP__
3142
__managed__ int v3;
32-
#endif
3343

3444
// DEV-DAG: @ev1 = external addrspace(1) global i32
3545
// HOST-DAG: @ev1 = external global i32
3646
extern __device__ int ev1;
3747
// DEV-DAG: @ev2 = external addrspace(4) global i32
3848
// HOST-DAG: @ev2 = external global i32
3949
extern __constant__ int ev2;
40-
// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
50+
// HIP-D-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
51+
// CUDA-D-DAG: @ev3 = external addrspace(1) global i32, align 4
4152
// HOST-DAG: @ev3 = external externally_initialized global ptr
42-
#if __HIP__
4353
extern __managed__ int ev3;
44-
#endif
4554

4655
// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
47-
// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
56+
// HIP-RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
57+
// CUDA-RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
4858
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
49-
// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
5059
static __device__ int sv1;
5160
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0
52-
// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
61+
// HIP-RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
62+
// CUDA-RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
5363
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
54-
// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
5564
static __constant__ int sv2;
56-
// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
57-
// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
65+
// HIP-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
66+
// CUDA-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 0, align 4
67+
// HIP-RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
68+
// CUDA-RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 0, align 4
5869
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global ptr null
59-
#if __HIP__
6070
static __managed__ int sv3;
61-
#endif
6271

6372
__device__ __host__ int work(int *x);
6473

6574
__device__ __host__ int fun1() {
66-
return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
67-
#if __HIP__
68-
+ work(&ev3) + work(&sv3)
69-
#endif
70-
;
75+
return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) +
76+
work(&ev3) + work(&sv3);
7177
}
7278

73-
// HOST: hipRegisterVar({{.*}}@v1
74-
// HOST: hipRegisterVar({{.*}}@v2
75-
// HOST: hipRegisterManagedVar({{.*}}@v3
76-
// HOST-NOT: hipRegisterVar({{.*}}@ev1
77-
// HOST-NOT: hipRegisterVar({{.*}}@ev2
78-
// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
79-
// HOST: hipRegisterVar({{.*}}@_ZL3sv1
80-
// HOST: hipRegisterVar({{.*}}@_ZL3sv2
81-
// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
79+
// HIP-H: hipRegisterVar({{.*}}@v1
80+
// HIP-H: hipRegisterVar({{.*}}@v2
81+
// HIP-H: hipRegisterManagedVar({{.*}}@v3
82+
// HIP-H-NOT: hipRegisterVar({{.*}}@ev1
83+
// HIP-H-NOT: hipRegisterVar({{.*}}@ev2
84+
// HIP-H-NOT: hipRegisterManagedVar({{.*}}@ev3
85+
// HIP-H: hipRegisterVar({{.*}}@_ZL3sv1
86+
// HIP-H: hipRegisterVar({{.*}}@_ZL3sv2
87+
// HIP-H: hipRegisterManagedVar({{.*}}@_ZL3sv3

0 commit comments

Comments
 (0)