Skip to content

[Clang][CUDA] Add support for __managed__ variables in non-RDC and default RDC mode #149716

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -1562,7 +1562,7 @@ def CUDAGridConstant : InheritableAttr {
def HIPManaged : InheritableAttr {
let Spellings = [GNU<"managed">, Declspec<"__managed__">];
let Subjects = SubjectList<[Var]>;
let LangOpts = [HIP];
let LangOpts = [HIP, CUDA];
let Documentation = [HIPManagedAttrDocs];
}

Expand Down
56 changes: 45 additions & 11 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class CGNVCUDARuntime : public CGCUDARuntime {
StringRef Prefix;

private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::IntegerType *IntTy, *SizeTy, *CharTy;
llvm::Type *VoidTy;
llvm::PointerType *PtrTy;

Expand Down Expand Up @@ -231,6 +231,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
CharTy = CGM.CharTy;

if (CGM.getLangOpts().OffloadViaLLVM)
Prefix = "llvm";
Expand Down Expand Up @@ -547,10 +548,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
// ManagedVar populated by HIP/CUDA runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
llvm::GlobalVariable *ManagedVar) {
SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;

for (auto &&VarUse : Var->uses()) {
WorkList.push_back({VarUse.getUser()});
}
Expand Down Expand Up @@ -661,8 +663,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
addUnderscoredPrefixToName("RegisterVar"));
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
// size_t, unsigned)
llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
PtrTy, VarSizeTy, IntTy};
// void __cudaRegisterManagedVar(void **, void **, char *, const char *,
// int, size_t, int, int)
SmallVector<llvm::Type *, 8> RegisterManagedVarParams;
if (CGM.getLangOpts().HIP)
RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, VarSizeTy, IntTy};
else
RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy,
IntTy, VarSizeTy, IntTy, IntTy};

llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
addUnderscoredPrefixToName("RegisterManagedVar"));
Expand Down Expand Up @@ -693,13 +702,23 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
"HIP managed variables not transformed");
auto *ManagedVar = CGM.getModule().getNamedGlobal(
Var->getName().drop_back(StringRef(".managed").size()));
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
ManagedVar,
Var,
VarName,
llvm::ConstantInt::get(VarSizeTy, VarSize),
llvm::ConstantInt::get(IntTy, Var->getAlignment())};
SmallVector<llvm::Value *, 8> Args;
if (CGM.getLangOpts().HIP)
Args = {&GpuBinaryHandlePtr,
ManagedVar,
Var,
VarName,
llvm::ConstantInt::get(VarSizeTy, VarSize),
llvm::ConstantInt::get(IntTy, Var->getAlignment())};
else
Args = {&GpuBinaryHandlePtr,
ManagedVar,
VarName,
VarName,
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
llvm::ConstantInt::get(VarSizeTy, VarSize),
llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
llvm::ConstantInt::get(IntTy, 0)};
if (!Var->isDeclaration())
Builder.CreateCall(RegisterManagedVar, Args);
} else {
Expand Down Expand Up @@ -965,6 +984,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
"__cudaRegisterFatBinaryEnd");
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
}
// Call __cudaInitModule(GpuBinaryHandle) for managed variables
for (auto &&Info : DeviceVars) {
llvm::GlobalVariable *Var = Info.Var;
if (!Var->isDeclaration() && Info.Flags.isManaged()) {
llvm::FunctionCallee NvInitManagedRtWithModule =
CGM.CreateRuntimeFunction(
llvm::FunctionType::get(CharTy, PtrTy, false),
"__cudaInitModule");
CtorBuilder.CreateCall(NvInitManagedRtWithModule, GpuBinaryHandle);
break;
}
}
} else {
// Generate a unique module ID.
SmallString<64> ModuleID;
Expand Down Expand Up @@ -1158,6 +1189,9 @@ void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
// CUDA managed variables directly access in device code
if (!CGM.getLangOpts().HIP && CGM.getLangOpts().CUDAIsDevice)
return;
for (auto &&Info : DeviceVars) {
llvm::GlobalVariable *Var = Info.Var;
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
Expand Down
15 changes: 11 additions & 4 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,19 +241,26 @@ RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,

void NVPTXTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
if (GV->isDeclaration())
return;

const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
if (VD) {
if (M.getLangOpts().CUDA) {
if (VD->getType()->isCUDADeviceBuiltinSurfaceType())
if (!GV->isDeclaration() &&
VD->getType()->isCUDADeviceBuiltinSurfaceType())
addNVVMMetadata(GV, "surface", 1);
else if (VD->getType()->isCUDADeviceBuiltinTextureType())
else if (!GV->isDeclaration() &&
VD->getType()->isCUDADeviceBuiltinTextureType())
addNVVMMetadata(GV, "texture", 1);
// nvlink asserts managed attribute match in decl and def
else if (VD->hasAttr<HIPManagedAttr>())
addNVVMMetadata(GV, "managed", 1);
return;
}
}

if (GV->isDeclaration())
return;

const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
if (!FD)
return;
Expand Down
2 changes: 0 additions & 2 deletions clang/test/CodeGenCUDA/Inputs/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@
#define __global__ __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
#if __HIP__
#define __managed__ __attribute__((managed))
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
#else
Expand Down
12 changes: 4 additions & 8 deletions clang/test/CodeGenCUDA/anon-ns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,26 @@
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
// CUDA-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant
// CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global

// COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global
// COMNEG-NOT: @{{.*}} = {{.*}} c"_ZN12_GLOBAL__N_12vdE{{.*}}\00"

// HIP-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
// CUDA-DAG: @llvm.compiler.used = {{.*}}@[[VT]]{{.*}}@[[VC]]
// COMMON-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]

// COMMON-DAG: @[[KERNSTR:.*]] = {{.*}} c"[[KERN]]\00"
// COMMON-DAG: @[[KTXSTR:.*]] = {{.*}} c"[[KTX]]\00"
// COMMON-DAG: @[[KTLSTR:.*]] = {{.*}} c"[[KTL]]\00"
// HIP-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
// COMMON-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
// COMMON-DAG: @[[VCSTR:.*]] = {{.*}} c"[[VC]]\00"
// COMMON-DAG: @[[VTSTR:.*]] = {{.*}} c"[[VT]]\00"

// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KERNSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTXSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTLSTR]]
// HIP-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VCSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VTSTR]]

Expand All @@ -67,9 +67,7 @@ namespace {
struct X {};
X x;
auto lambda = [](){};
#if __HIP__
__managed__ int vm = 1;
#endif
__constant__ int vc = 2;

// C should not be externalized since it is used by device code only.
Expand All @@ -89,9 +87,7 @@ void test() {

// A, B, and tempVar<X> should be externalized since they are
// used by host code.
#if __HIP__
getSymbol(&vm);
#endif
getSymbol(&vc);
getSymbol(&vt<X>);
}
74 changes: 40 additions & 34 deletions clang/test/CodeGenCUDA/device-var-linkage.cu
Original file line number Diff line number Diff line change
@@ -1,18 +1,29 @@
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
// RUN: | FileCheck -check-prefixes=DEV,HIP-D,NORDC,HIP-NORDC %s
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,RDC %s
// RUN: | FileCheck -check-prefixes=DEV,HIP-D %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
// RUN: | FileCheck -check-prefixes=HOST,HIP-H,NORDC-H %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
// RUN: | FileCheck -check-prefixes=HOST,HIP-H,RDC-H %s

// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
// RUN: | FileCheck -check-prefixes=CUDA %s
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefixes=DEV,CUDA-D,NORDC,CUDA-NORDC %s
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefixes=DEV,CUDA-D %s

// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s

#include "Inputs/cuda.h"

Expand All @@ -24,58 +35,53 @@ __device__ int v1;
// NORDC-H-DAG: @v2 = internal global i32 undef
// RDC-H-DAG: @v2 = global i32 undef
__constant__ int v2;
// DEV-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
// HIP-D-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
// CUDA-D-DAG: @v3 = addrspace(1) externally_initialized global i32 0, align 4
// NORDC-H-DAG: @v3 = internal externally_initialized global ptr null
// RDC-H-DAG: @v3 = externally_initialized global ptr null
#if __HIP__
__managed__ int v3;
#endif

// DEV-DAG: @ev1 = external addrspace(1) global i32
// HOST-DAG: @ev1 = external global i32
extern __device__ int ev1;
// DEV-DAG: @ev2 = external addrspace(4) global i32
// HOST-DAG: @ev2 = external global i32
extern __constant__ int ev2;
// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
// HIP-D-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
// CUDA-D-DAG: @ev3 = external addrspace(1) global i32, align 4
// HOST-DAG: @ev3 = external externally_initialized global ptr
#if __HIP__
extern __managed__ int ev3;
#endif

// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// HIP-RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// CUDA-RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0
// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
// HIP-RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
// CUDA-RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
static __constant__ int sv2;
// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
// HIP-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
// CUDA-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 0, align 4
// HIP-RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
// CUDA-RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 0, align 4
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global ptr null
#if __HIP__
static __managed__ int sv3;
#endif

__device__ __host__ int work(int *x);

__device__ __host__ int fun1() {
return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
#if __HIP__
+ work(&ev3) + work(&sv3)
#endif
;
return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) +
work(&ev3) + work(&sv3);
}

// HOST: hipRegisterVar({{.*}}@v1
// HOST: hipRegisterVar({{.*}}@v2
// HOST: hipRegisterManagedVar({{.*}}@v3
// HOST-NOT: hipRegisterVar({{.*}}@ev1
// HOST-NOT: hipRegisterVar({{.*}}@ev2
// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
// HOST: hipRegisterVar({{.*}}@_ZL3sv1
// HOST: hipRegisterVar({{.*}}@_ZL3sv2
// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
// HIP-H: hipRegisterVar({{.*}}@v1
// HIP-H: hipRegisterVar({{.*}}@v2
// HIP-H: hipRegisterManagedVar({{.*}}@v3
// HIP-H-NOT: hipRegisterVar({{.*}}@ev1
// HIP-H-NOT: hipRegisterVar({{.*}}@ev2
// HIP-H-NOT: hipRegisterManagedVar({{.*}}@ev3
// HIP-H: hipRegisterVar({{.*}}@_ZL3sv1
// HIP-H: hipRegisterVar({{.*}}@_ZL3sv2
// HIP-H: hipRegisterManagedVar({{.*}}@_ZL3sv3
Loading