From 922c2d5a1713628f991f3ceb089dd245200aa98e Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Mon, 24 Jun 2024 14:47:42 -0700 Subject: [PATCH 1/6] [NVPTX] Basic support for "grid_constant" (#96125) - Adds a helper function for checking whether an argument is a [grid_constant](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#supported-properties). - Adds support for cvta.param using changes from https://github.com/llvm/llvm-project/pull/95289 - Supports escaped grid_constant pointers conservatively, by casting all uses to the generic address space with cvta.param. --- llvm/include/llvm/IR/IntrinsicsNVVM.td | 6 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 1 + llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 77 ++++++--- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 143 ++++++++-------- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 1 + .../CodeGen/NVPTX/lower-args-gridconstant.ll | 155 ++++++++++++++++++ 6 files changed, 297 insertions(+), 86 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 4320140526a43..15bf5da740abc 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1698,6 +1698,12 @@ def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable, IntrNoCallback], "llvm.nvvm.ptr.gen.to.param">; +// sm70+, PTX7.7+ +def int_nvvm_ptr_param_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], + [llvm_anyptr_ty], + [IntrNoMem, IntrSpeculatable, IntrNoCallback], + "llvm.nvvm.ptr.param.to.gen">; + // Move intrinsics, used in nvvm internally def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem], diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index aa1c2dcc9257f..f787237f38a0c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2684,6 +2684,7 @@ defm 
cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal> defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>; defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>; defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>; +defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>; defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>; defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index cde02c25c4834..e63c7a61c6f26 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -95,7 +95,9 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" @@ -336,8 +338,9 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, while (!ValuesToCheck.empty()) { Value *V = ValuesToCheck.pop_back_val(); if (!IsALoadChainInstr(V)) { - LLVM_DEBUG(dbgs() << "Need a copy of " << *Arg << " because of " << *V - << "\n"); + LLVM_DEBUG(dbgs() << "Need a " + << (isParamGridConstant(*Arg) ? "cast " : "copy ") + << "of " << *Arg << " because of " << *V << "\n"); (void)Arg; return false; } @@ -366,27 +369,59 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, return; } - // Otherwise we have to create a temporary copy. const DataLayout &DL = Func->getParent()->getDataLayout(); unsigned AS = DL.getAllocaAddrSpace(); - AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst); - // Set the alignment to alignment of the byval parameter. 
This is because, - // later load/stores assume that alignment, and we are going to replace - // the use of the byval parameter with this alloca instruction. - AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo()) - .value_or(DL.getPrefTypeAlign(StructType))); - Arg->replaceAllUsesWith(AllocA); - - Value *ArgInParam = new AddrSpaceCastInst( - Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), - FirstInst); - // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX - // addrspacecast preserves alignment. Since params are constant, this load is - // definitely not volatile. - LoadInst *LI = - new LoadInst(StructType, ArgInParam, Arg->getName(), - /*isVolatile=*/false, AllocA->getAlign(), FirstInst); - new StoreInst(LI, AllocA, FirstInst); + if (isParamGridConstant(*Arg)) { + // Writes to a grid constant are undefined behaviour. We do not need a + // temporary copy. When a pointer might have escaped, conservatively replace + // all of its uses (which might include a device function call) with a cast + // to the generic address space. + // TODO: only cast byval grid constant parameters at use points that need + // generic address (e.g., merging parameter pointers with other address + // space, or escaping to call-sites, inline-asm, memory), and use the + // parameter address space for normal loads. + IRBuilder<> IRB(&Func->getEntryBlock().front()); + + // Cast argument to param address space + auto *CastToParam = + cast(IRB.CreateAddrSpaceCast( + Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param")); + + // Cast param address to generic address space. We do not use an + // addrspacecast to generic here, because, LLVM considers `Arg` to be in the + // generic address space, and a `generic -> param` cast followed by a `param + // -> generic` cast will be folded away. The `param -> generic` intrinsic + // will be correctly lowered to `cvta.param`. 
+ Value *CvtToGenCall = IRB.CreateIntrinsic( + IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen, + CastToParam, nullptr, CastToParam->getName() + ".gen"); + + Arg->replaceAllUsesWith(CvtToGenCall); + + // Do not replace Arg in the cast to param space + CastToParam->setOperand(0, Arg); + } else { + // Otherwise we have to create a temporary copy. + AllocaInst *AllocA = + new AllocaInst(StructType, AS, Arg->getName(), FirstInst); + // Set the alignment to alignment of the byval parameter. This is because, + // later load/stores assume that alignment, and we are going to replace + // the use of the byval parameter with this alloca instruction. + AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo()) + .value_or(DL.getPrefTypeAlign(StructType))); + Arg->replaceAllUsesWith(AllocA); + + Value *ArgInParam = new AddrSpaceCastInst( + Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM), + Arg->getName(), FirstInst); + // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX + // addrspacecast preserves alignment. Since params are constant, this load + // is definitely not volatile. 
+ LoadInst *LI = + new LoadInst(StructType, ArgInParam, Arg->getName(), + /*isVolatile=*/false, AllocA->getAlign(), FirstInst); + new StoreInst(LI, AllocA, FirstInst); + } } void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) { diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 013afe916e86c..c2adcb61f1138 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -52,29 +52,46 @@ void clearAnnotationCache(const Module *Mod) { AC.Cache.erase(Mod); } -static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { +static void readIntVecFromMDNode(const MDNode *MetadataNode, + std::vector &Vec) { + for (unsigned i = 0, e = MetadataNode->getNumOperands(); i != e; ++i) { + ConstantInt *Val = + mdconst::extract(MetadataNode->getOperand(i)); + Vec.push_back(Val->getZExtValue()); + } +} + +static void cacheAnnotationFromMD(const MDNode *MetadataNode, + key_val_pair_t &retval) { auto &AC = getAnnotationCache(); std::lock_guard Guard(AC.Lock); - assert(md && "Invalid mdnode for annotation"); - assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands"); + assert(MetadataNode && "Invalid mdnode for annotation"); + assert((MetadataNode->getNumOperands() % 2) == 1 && + "Invalid number of operands"); // start index = 1, to skip the global variable key // increment = 2, to skip the value for each property-value pairs - for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) { + for (unsigned i = 1, e = MetadataNode->getNumOperands(); i != e; i += 2) { // property - const MDString *prop = dyn_cast(md->getOperand(i)); + const MDString *prop = dyn_cast(MetadataNode->getOperand(i)); assert(prop && "Annotation property not a string"); + std::string Key = prop->getString().str(); // value - ConstantInt *Val = mdconst::dyn_extract(md->getOperand(i + 1)); - assert(Val && "Value operand not a constant int"); - - std::string keyname = prop->getString().str(); 
- if (retval.find(keyname) != retval.end()) - retval[keyname].push_back(Val->getZExtValue()); - else { - std::vector tmp; - tmp.push_back(Val->getZExtValue()); - retval[keyname] = tmp; + if (ConstantInt *Val = mdconst::dyn_extract( + MetadataNode->getOperand(i + 1))) { + retval[Key].push_back(Val->getZExtValue()); + } else if (MDNode *VecMd = + dyn_cast(MetadataNode->getOperand(i + 1))) { + // note: only "grid_constant" annotations support vector MDNodes. + // assert: there can only exist one unique key value pair of + // the form (string key, MDNode node). Operands of such a node + // shall always be unsigned ints. + if (retval.find(Key) == retval.end()) { + readIntVecFromMDNode(VecMd, retval[Key]); + continue; + } + } else { + llvm_unreachable("Value operand not a constant int or an mdnode"); } } } @@ -145,9 +162,9 @@ bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, bool isTexture(const Value &val) { if (const GlobalValue *gv = dyn_cast(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, "texture", annot)) { - assert((annot == 1) && "Unexpected annotation on a texture symbol"); + unsigned Annot; + if (findOneNVVMAnnotation(gv, "texture", Annot)) { + assert((Annot == 1) && "Unexpected annotation on a texture symbol"); return true; } } @@ -156,70 +173,67 @@ bool isTexture(const Value &val) { bool isSurface(const Value &val) { if (const GlobalValue *gv = dyn_cast(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, "surface", annot)) { - assert((annot == 1) && "Unexpected annotation on a surface symbol"); + unsigned Annot; + if (findOneNVVMAnnotation(gv, "surface", Annot)) { + assert((Annot == 1) && "Unexpected annotation on a surface symbol"); return true; } } return false; } -bool isSampler(const Value &val) { - const char *AnnotationName = "sampler"; - - if (const GlobalValue *gv = dyn_cast(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, AnnotationName, annot)) { - assert((annot == 1) && "Unexpected annotation on 
a sampler symbol"); - return true; - } - } - if (const Argument *arg = dyn_cast(&val)) { - const Function *func = arg->getParent(); - std::vector annot; - if (findAllNVVMAnnotation(func, AnnotationName, annot)) { - if (is_contained(annot, arg->getArgNo())) +static bool argHasNVVMAnnotation(const Value &Val, + const std::string &Annotation, + const bool StartArgIndexAtOne = false) { + if (const Argument *Arg = dyn_cast(&Val)) { + const Function *Func = Arg->getParent(); + std::vector Annot; + if (findAllNVVMAnnotation(Func, Annotation, Annot)) { + const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0; + if (is_contained(Annot, BaseOffset + Arg->getArgNo())) { return true; + } } } return false; } -bool isImageReadOnly(const Value &val) { - if (const Argument *arg = dyn_cast(&val)) { - const Function *func = arg->getParent(); - std::vector annot; - if (findAllNVVMAnnotation(func, "rdoimage", annot)) { - if (is_contained(annot, arg->getArgNo())) - return true; +bool isParamGridConstant(const Value &V) { + if (const Argument *Arg = dyn_cast(&V)) { + // "grid_constant" counts argument indices starting from 1 + if (Arg->hasByValAttr() && + argHasNVVMAnnotation(*Arg, "grid_constant", /*StartArgIndexAtOne*/true)) { + assert(isKernelFunction(*Arg->getParent()) && + "only kernel arguments can be grid_constant"); + return true; } } return false; } -bool isImageWriteOnly(const Value &val) { - if (const Argument *arg = dyn_cast(&val)) { - const Function *func = arg->getParent(); - std::vector annot; - if (findAllNVVMAnnotation(func, "wroimage", annot)) { - if (is_contained(annot, arg->getArgNo())) - return true; +bool isSampler(const Value &val) { + const char *AnnotationName = "sampler"; + + if (const GlobalValue *gv = dyn_cast(&val)) { + unsigned Annot; + if (findOneNVVMAnnotation(gv, AnnotationName, Annot)) { + assert((Annot == 1) && "Unexpected annotation on a sampler symbol"); + return true; } } - return false; + return argHasNVVMAnnotation(val, AnnotationName); +} + +bool 
isImageReadOnly(const Value &val) { + return argHasNVVMAnnotation(val, "rdoimage"); +} + +bool isImageWriteOnly(const Value &val) { + return argHasNVVMAnnotation(val, "wroimage"); } bool isImageReadWrite(const Value &val) { - if (const Argument *arg = dyn_cast(&val)) { - const Function *func = arg->getParent(); - std::vector annot; - if (findAllNVVMAnnotation(func, "rdwrimage", annot)) { - if (is_contained(annot, arg->getArgNo())) - return true; - } - } - return false; + return argHasNVVMAnnotation(val, "rdwrimage"); } bool isImage(const Value &val) { @@ -228,9 +242,9 @@ bool isImage(const Value &val) { bool isManaged(const Value &val) { if(const GlobalValue *gv = dyn_cast(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, "managed", annot)) { - assert((annot == 1) && "Unexpected annotation on a managed symbol"); + unsigned Annot; + if (findOneNVVMAnnotation(gv, "managed", Annot)) { + assert((Annot == 1) && "Unexpected annotation on a managed symbol"); return true; } } @@ -290,8 +304,7 @@ bool getMaxNReg(const Function &F, unsigned &x) { bool isKernelFunction(const Function &F) { unsigned x = 0; - bool retval = findOneNVVMAnnotation(&F, "kernel", x); - if (!retval) { + if (!findOneNVVMAnnotation(&F, "kernel", x)) { // There is no NVVM metadata, check the calling convention return F.getCallingConv() == CallingConv::PTX_Kernel; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 2872db9fa2131..0ed9d8873b24f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -60,6 +60,7 @@ bool getMaxClusterRank(const Function &, unsigned &); bool getMinCTASm(const Function &, unsigned &); bool getMaxNReg(const Function &, unsigned &); bool isKernelFunction(const Function &); +bool isParamGridConstant(const Value &); MaybeAlign getAlign(const Function &, unsigned); MaybeAlign getAlign(const CallInst &, unsigned); diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll 
b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll new file mode 100644 index 0000000000000..46f54e0e6f4d4 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT +; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX + +define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { +; PTX-LABEL: grid_const_int( +; PTX-NOT: ld.u32 +; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0]; +; +; OPT-LABEL: define void @grid_const_int( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) { +; OPT-NOT: alloca +; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 +; + %tmp = load i32, ptr %input1, align 4 + %add = add i32 %tmp, %input2 + store i32 %add, ptr %out + ret void +} + +%struct.s = type { i32, i32 } + +define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ +; PTX-LABEL: grid_const_struct( +; PTX: { +; PTX-NOT: ld.u32 +; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0]; +; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4]; +; +; OPT-LABEL: define void @grid_const_struct( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) { +; OPT-NOT: alloca +; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 +; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 +; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 +; OPT: [[TMP2:%.*]] = load i32, ptr 
addrspace(101) [[GEP22]], align 4 +; + %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 + %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 + %int1 = load i32, ptr %gep1 + %int2 = load i32, ptr %gep2 + %add = add i32 %int1, %int2 + store i32 %add, ptr %out + ret void +} + +define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { +; PTX-LABEL: grid_const_escape( +; PTX: { +; PTX-NOT: .local +; PTX: cvta.param.{{.*}} +; OPT-LABEL: define void @grid_const_escape( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) { +; OPT-NOT: alloca [[STRUCT_S]] +; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) +; + %call = call i32 @escape(ptr %input) + ret void +} + +define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { +; PTX-LABEL: multiple_grid_const_escape( +; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0; +; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2; +; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]]; +; PTX: cvta.param.{{.*}} [[RD4:%.*]], [[RD3]]; +; PTX: mov.u64 [[RD5:%.*]], [[RD1]]; +; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD5]]; +; PTX: { +; PTX: st.param.b64 [param0+0], [[RD6]]; +; PTX: st.param.b64 [param2+0], [[RD4]]; +; +; OPT-LABEL: define void @multiple_grid_const_escape( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) { +; OPT-NOT: alloca i32 +; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-NOT: alloca [[STRUCT_S]] +; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: 
[[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, ptr [[B_PARAM_GEN]]) +; + %a.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b) + ret void +} + +define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) { +; PTX-LABEL: grid_const_memory_escape( +; PTX-NOT: .local +; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0; +; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]]; +; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]]; +; +; OPT-LABEL: define void @grid_const_memory_escape( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) { +; OPT-NOT: alloca [[STRUCT_S]] +; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8 +; + store ptr %input, ptr %addr, align 8 + ret void +} + +define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) { +; PTX-LABEL: grid_const_inlineasm_escape( +; PTX-NOT .local +; PTX: cvta.param.u64 [[RD2:%.*]], {{.*}} +; PTX: add.{{.*}} [[RD3:%.*]], [[RD2]], 4; +; PTX: add.s64 [[RD1:%.*]], [[RD2]], [[RD3]]; +; +; OPT-LABEL: define void @grid_const_inlineasm_escape( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) { +; OPT-NOT: alloca [[STRUCT_S]] +; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 0 +; OPT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 
1 +; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 +; + %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 + %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 + %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1 + store i64 %1, ptr %result, align 8 + ret void +} + +declare dso_local ptr @escape(ptr) local_unnamed_addr +declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr + +!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11} + +!0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1} +!1 = !{i32 1} + +!2 = !{ptr @grid_const_struct, !"kernel", i32 1, !"grid_constant", !3} +!3 = !{i32 1} + +!4 = !{ptr @grid_const_escape, !"kernel", i32 1, !"grid_constant", !5} +!5 = !{i32 1} + +!6 = !{ptr @multiple_grid_const_escape, !"kernel", i32 1, !"grid_constant", !7} +!7 = !{i32 1, i32 3} + +!8 = !{ptr @grid_const_memory_escape, !"kernel", i32 1, !"grid_constant", !9} +!9 = !{i32 1} + +!10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11} +!11 = !{i32 1} From 4a9216f8b09a60af0a9c45adac7e8acffdbab1bc Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 27 Jun 2024 17:37:21 +0100 Subject: [PATCH 2/6] [SYCL][NVPTX] Emit 'grid_constant' NVVM annotations for by-val kernel params Also fix up the DeadArgumentElimination passes to correctly preserve the annotations; when removing arguments from functions, dead parameters need pruned and alive ones may need their values shifted down by the number of dead arguments that came before them. 
--- clang/lib/CodeGen/Targets/NVPTX.cpp | 133 ++++++++++++++++++ clang/test/CodeGenSYCL/nvvm-annotations.cpp | 34 +++++ .../Transforms/IPO/DeadArgumentElimination.h | 3 +- .../IPO/DeadArgumentElimination.cpp | 72 +++++++++- .../DeadArgElim/nvvm-annotations.ll | 113 +++++++++++++++ 5 files changed, 351 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGenSYCL/nvvm-annotations.cpp create mode 100644 llvm/test/Transforms/DeadArgElim/nvvm-annotations.ll diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 252a83e5b81bb..921dca6adf72d 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -8,6 +8,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" +#include "clang/Basic/Cuda.h" #include "llvm/IR/IntrinsicsNVPTX.h" using namespace clang; @@ -80,6 +81,9 @@ class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo { static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name, int Operand); + static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name, + const std::vector &Operands); + private: static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst, LValue Src) { @@ -218,6 +222,98 @@ Address NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, llvm_unreachable("NVPTX does not support varargs"); } +// Get current CudaArch and ignore any unknown values +// Copied from CGOpenMPRuntimeGPU +static CudaArch getCudaArch(CodeGenModule &CGM) { + if (!CGM.getTarget().hasFeature("ptx")) + return CudaArch::UNKNOWN; + for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { + if (Feature.getValue()) { + CudaArch Arch = StringToCudaArch(Feature.getKey()); + if (Arch != CudaArch::UNKNOWN) + return Arch; + } + } + return CudaArch::UNKNOWN; +} + +static bool supportsGridConstant(CudaArch Arch) { + switch (Arch) { + case CudaArch::SM_70: + case CudaArch::SM_72: + case CudaArch::SM_75: + case CudaArch::SM_80: + case CudaArch::SM_86: + case 
CudaArch::SM_87: + case CudaArch::SM_89: + case CudaArch::SM_90: + case CudaArch::SM_90a: + return true; + case CudaArch::UNKNOWN: + case CudaArch::UNUSED: + case CudaArch::SM_20: + case CudaArch::SM_21: + case CudaArch::SM_30: + case CudaArch::SM_32_: + case CudaArch::SM_35: + case CudaArch::SM_37: + case CudaArch::SM_50: + case CudaArch::SM_52: + case CudaArch::SM_53: + case CudaArch::SM_60: + case CudaArch::SM_61: + case CudaArch::SM_62: + return false; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX602: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX705: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX805: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + case CudaArch::GFX904: + case CudaArch::GFX906: + case CudaArch::GFX908: + case CudaArch::GFX909: + case CudaArch::GFX90a: + case CudaArch::GFX90c: + case CudaArch::GFX940: + case CudaArch::GFX941: + case CudaArch::GFX942: + case CudaArch::GFX1010: + case CudaArch::GFX1011: + case CudaArch::GFX1012: + case CudaArch::GFX1013: + case CudaArch::GFX1030: + case CudaArch::GFX1031: + case CudaArch::GFX1032: + case CudaArch::GFX1033: + case CudaArch::GFX1034: + case CudaArch::GFX1035: + case CudaArch::GFX1036: + case CudaArch::GFX1100: + case CudaArch::GFX1101: + case CudaArch::GFX1102: + case CudaArch::GFX1103: + case CudaArch::GFX1150: + case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: + case CudaArch::Generic: + case CudaArch::LAST: + llvm_unreachable("unhandled CudaArch"); + } +} + void NVPTXTargetCodeGenInfo::setTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { if (GV->isDeclaration()) @@ -248,6 +344,21 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( addNVVMMetadata(F, "kernel", 1); // And kernel functions are not subject to inlining 
F->addFnAttr(llvm::Attribute::NoInline); + + if (supportsGridConstant(getCudaArch(M))) { + // Add grid_constant annotations to all relevant kernel-function + // parameters. We can guarantee that in SYCL, all by-val kernel + // parameters are "grid_constant". + std::vector GridConstantParamIdxs; + for (auto [Idx, Arg] : llvm::enumerate(F->args())) { + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) { + // Note - the parameter indices are numbered from 1. + GridConstantParamIdxs.push_back(Idx + 1); + } + } + if (!GridConstantParamIdxs.empty()) + addNVVMMetadata(F, "grid_constant", GridConstantParamIdxs); + } } bool HasMaxWorkGroupSize = false; bool HasMinWorkGroupPerCU = false; @@ -329,6 +440,28 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); } +void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, + StringRef Name, + const std::vector &Operands) { + llvm::Module *M = GV->getParent(); + llvm::LLVMContext &Ctx = M->getContext(); + + // Get "nvvm.annotations" metadata node + llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); + + llvm::SmallVector MDOps; + for (int Op : Operands) { + MDOps.push_back(llvm::ConstantAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Op))); + } + auto *OpList = llvm::MDNode::get(Ctx, MDOps); + + llvm::Metadata *MDVals[] = {llvm::ConstantAsMetadata::get(GV), + llvm::MDString::get(Ctx, Name), OpList}; + // Append metadata to nvvm.annotations + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); +} + bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const { return false; } diff --git a/clang/test/CodeGenSYCL/nvvm-annotations.cpp b/clang/test/CodeGenSYCL/nvvm-annotations.cpp new file mode 100644 index 0000000000000..858648d901fb7 --- /dev/null +++ b/clang/test/CodeGenSYCL/nvvm-annotations.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -fno-sycl-force-inline-kernel-lambda -fsycl-is-device -internal-isystem 
%S/Inputs -triple nvptx-nvidia-cuda -target-cpu sm_70 -disable-llvm-passes -sycl-std=2020 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,GRIDCONST +// RUN: %clang_cc1 -fno-sycl-force-inline-kernel-lambda -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -target-cpu sm_70 -disable-llvm-passes -sycl-std=2020 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,GRIDCONST + +// RUN: %clang_cc1 -fno-sycl-force-inline-kernel-lambda -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -target-cpu sm_60 -disable-llvm-passes -sycl-std=2020 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,NOGRIDCONST +// RUN: %clang_cc1 -fno-sycl-force-inline-kernel-lambda -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -target-cpu sm_60 -disable-llvm-passes -sycl-std=2020 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,NOGRIDCONST + +// Tests that certain SYCL kernel parameters are annotated with "grid_constant" for supported microarchitectures. + +#include "sycl.hpp" + +using namespace sycl; + +int main() { + queue q; + + struct S { + int a; + } s; + + q.submit([&](handler &h) { + // CHECK: define{{.*}} void @[[FUNC1:.*kernel_grid_const_params]](ptr noundef byval(%struct.S) align 4 %_arg_s) + h.single_task([=]() { (void) s;}); + }); + + return 0; +} + +// Don't emit grid_constant annotations for older architectures. +// NOGRIDCONST-NOT: "grid_constant" + +// This isn't stable in general, as it depends on the order of the captured +// parameters, but in this case there's only one parameter so we know it's 1. 
+// GRIDCONST-DAG: = !{ptr @[[FUNC1]], !"grid_constant", [[MD:\![0-9]+]]}
+// GRIDCONST-DAG: [[MD]] = !{i32 1}
diff --git a/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h b/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h
index 0b67319e3276a..094d716587205 100644
--- a/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h
+++ b/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h
@@ -145,7 +145,8 @@ class DeadArgumentEliminationPass
   bool removeDeadArgumentsFromCallers(Function &F);
   void propagateVirtMustcallLiveness(const Module &M);
 
-  void UpdateNVPTXMetadata(Module &M, Function *F, Function *NF);
+  void UpdateNVPTXMetadata(Module &M, Function *F, Function *NF,
+                           const SmallVectorImpl<bool> &ArgAlive);
 
   llvm::DenseSet<const Function *> NVPTXKernelSet;
   bool IsNVPTXKernel(const Function *F) { return NVPTXKernelSet.contains(F); };
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 48b46b4b151f3..c47fbffe16619 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1171,7 +1171,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
     NF->addMetadata(KindID, *Node);
 
   if (IsNVPTXKernel(F))
-    UpdateNVPTXMetadata(*(F->getParent()), F, NF);
+    UpdateNVPTXMetadata(*(F->getParent()), F, NF, ArgAlive);
 
   // If either the return value(s) or argument(s) are removed, then probably the
   // function does not follow standard calling conventions anymore. Hence, add
@@ -1249,8 +1249,9 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
   return PreservedAnalyses::none();
 }
 
-void DeadArgumentEliminationPass::UpdateNVPTXMetadata(Module &M, Function *F,
-                                                      Function *NF) {
+void DeadArgumentEliminationPass::UpdateNVPTXMetadata(
+    Module &M, Function *F, Function *NF,
+    const SmallVectorImpl<bool> &ArgAlive) {
 
   auto *NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
   if (!NvvmMetadata)
@@ -1268,5 +1269,70 @@ void DeadArgumentEliminationPass::UpdateNVPTXMetadata(Module &M, Function *F,
       continue;
     // Update the metadata with the new function
     MetadataNode->replaceOperandWith(0, llvm::ConstantAsMetadata::get(NF));
+
+    // Carefully update any and all grid_constant annotations, since those are
+    // denoted parameter indices, which may have changed
+    for (unsigned i = 1; i < MetadataNode->getNumOperands() - 1; i += 2) {
+      if (auto *Type = dyn_cast<MDString>(MetadataNode->getOperand(i));
+          Type && Type->getString() == "grid_constant") {
+        LLVMContext &Ctx = NF->getContext();
+        LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - updating nvvm "
+                             "grid_constant annotations for fn: "
+                          << NF->getName() << "\n");
+        // The 'value' operand is a list of integers denoting parameter indices
+        auto *OldGridConstParamIdxs =
+            dyn_cast<MDNode>(MetadataNode->getOperand(i + 1));
+        if (!OldGridConstParamIdxs)
+          continue;
+        // For each parameter that's identified as a grid_constant, count how
+        // many arguments before that position are dead, and shift the number
+        // down by that amount.
+        // Note that there's no guaranteed order to the parameter indices, so
+        // there's fewer 'smart' things like counting up incrementally as we go.
+        SmallVector<Metadata *> NewGridConstParamOps;
+        for (const auto &Op : OldGridConstParamIdxs->operands()) {
+          auto *ParamIdx = mdconst::dyn_extract<ConstantInt>(Op);
+          // If the operand's not a constant, or its constant value is not
+          // within the range of the old function's parameter list (note - it
+          // counts from 1), it's not well-defined. Just strip it out for
+          // safety.
+          if (!ParamIdx || ParamIdx->isZero() ||
+              ParamIdx->getZExtValue() > F->arg_size())
+            continue;
+
+          size_t OldParamIdx = ParamIdx->getZExtValue() - 1;
+          // If the parameter is no longer alive, it's definitely not a
+          // grid_constant. Strip it out.
+          if (!ArgAlive[OldParamIdx])
+            continue;
+
+          unsigned ShiftDownAmt = 0;
+          for (unsigned i = 0; i < std::min(F->arg_size(), OldParamIdx); i++) {
+            if (!ArgAlive[i])
+              ShiftDownAmt++;
+          }
+          NewGridConstParamOps.push_back(
+              ConstantAsMetadata::get(ConstantInt::get(
+                  Type::getInt32Ty(Ctx), OldParamIdx - ShiftDownAmt + 1)));
+        }
+
+        // Update the metadata with the new grid_constant information
+        MDNode *NewGridConstParamIdxs = MDNode::get(Ctx, NewGridConstParamOps);
+
+        LLVM_DEBUG(dbgs() << " * updating old annotation {";
+                   auto PrintList =
+                       [](const MDNode *MD) {
+                         for (const auto &O : MD->operands())
+                           if (const auto *ParamNo =
+                                   mdconst::dyn_extract<ConstantInt>(O))
+                             dbgs() << ParamNo->getZExtValue() << ",";
+                       };
+                   PrintList(OldGridConstParamIdxs);
+                   dbgs() << "} to new annotation {";
+                   PrintList(NewGridConstParamIdxs); dbgs() << "}\n";);
+
+        MetadataNode->replaceOperandWith(i + 1, NewGridConstParamIdxs);
+      }
+    }
   }
 }
diff --git a/llvm/test/Transforms/DeadArgElim/nvvm-annotations.ll b/llvm/test/Transforms/DeadArgElim/nvvm-annotations.ll
new file mode 100644
index 0000000000000..0a36eaf4043b3
--- /dev/null
+++ b/llvm/test/Transforms/DeadArgElim/nvvm-annotations.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt < %s -passes=deadargelim -S | FileCheck %s
+
+define internal void @test1(i32 %v, ptr byval(i32) %DEADARG1, ptr %p) {
+; CHECK-LABEL: define internal void @test1(
+; CHECK-SAME: i32 [[V:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    store i32 [[V]], ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %v, ptr %p
+  ret void
+}
+
+define internal void @test2(ptr byval(i32) %DEADARG1, ptr byval(i32) 
%p) { +; CHECK-LABEL: define internal void @test2( +; CHECK-SAME: ptr byval(i32) [[P:%.*]]) { +; CHECK-NEXT: store i32 0, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + store i32 0, ptr %p + ret void +} + +define internal void @test3(ptr byval(i32) %DEADARG1, i32 %v, ptr byval(i32) %p) { +; CHECK-LABEL: define internal void @test3( +; CHECK-SAME: i32 [[V:%.*]], ptr byval(i32) [[P:%.*]]) { +; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + store i32 %v, ptr %p + ret void +} + +define internal void @test4(ptr byval(i32) %p, i32 %v, ptr byval(i32) %DEADARG) { +; CHECK-LABEL: define internal void @test4( +; CHECK-SAME: ptr byval(i32) [[P:%.*]], i32 [[V:%.*]]) { +; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + store i32 %v, ptr %p + ret void +} + +define internal void @test5(ptr byval(i32) %p, i32 %x, ptr byval(i32) %DEADARG1, ptr byval(i32) %DEADARG2, i32 %y, ptr byval(i32) %q) { +; CHECK-LABEL: define internal void @test5( +; CHECK-SAME: ptr byval(i32) [[P:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]], ptr byval(i32) [[Q:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = add i32 [[X]], [[Y]] +; CHECK-NEXT: store i32 [[T]], ptr [[P]], align 4 +; CHECK-NEXT: store i32 [[T]], ptr [[Q]], align 4 +; CHECK-NEXT: ret void +; + %t = add i32 %x, %y + store i32 %t, ptr %p + store i32 %t, ptr %q + ret void +} + +!nvvm.annotations = !{ + !0, !1, + !3, !4, !6, + !8, !9, !11, + !13, !14, !16, + !18, !19 +} + +; Note - also test various permutations of the parameter lists, as they are not +; specified to be in any particular order (e.g., consecutive). 
+!0 = !{ptr @test1, !"kernel", i32 1} +!1 = !{ptr @test1, !"grid_constant", !2} +!2 = !{i32 2} + +!3 = !{ptr @test2, !"kernel", i32 1} +!4 = !{ptr @test2, !"grid_constant", !5} +!5 = !{i32 1, i32 2} +!6 = !{ptr @test2, !"grid_constant", !7} +!7 = !{i32 2, i32 1} + +!8 = !{ptr @test3, !"kernel", i32 1} +!9 = !{ptr @test3, !"grid_constant", !10} +!10 = !{i32 1, i32 3} +!11 = !{ptr @test3, !"grid_constant", !12} +!12 = !{i32 3, i32 1} + +!13 = !{ptr @test4, !"kernel", i32 1} +!14 = !{ptr @test4, !"grid_constant", !15} +!15 = !{i32 1, i32 3} +!16 = !{ptr @test4, !"grid_constant", !17} +!17 = !{i32 3, i32 1} + +!18 = !{ptr @test5, !"kernel", i32 1} +!19 = !{ptr @test5, !"grid_constant", !20, !"grid_constant", !21, !"grid_constant", !22} +!20 = !{i32 1, i32 3, i32 4, i32 6} +!21 = !{i32 3, i32 1, i32 4, i32 6} +!22 = !{i32 3, i32 1, i32 6, i32 4} +;. +; CHECK: [[META0:![0-9]+]] = !{ptr @test1, !"kernel", i32 1} +; CHECK: [[META1:![0-9]+]] = !{ptr @test1, !"grid_constant", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{} +; CHECK: [[META3:![0-9]+]] = !{ptr @test2, !"kernel", i32 1} +; CHECK: [[META4:![0-9]+]] = !{ptr @test2, !"grid_constant", [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{i32 1} +; CHECK: [[META6:![0-9]+]] = distinct !{ptr @test2, !"grid_constant", [[META5]]} +; CHECK: [[META7:![0-9]+]] = !{ptr @test3, !"kernel", i32 1} +; CHECK: [[META8:![0-9]+]] = !{ptr @test3, !"grid_constant", [[META9:![0-9]+]]} +; CHECK: [[META9]] = !{i32 2} +; CHECK: [[META10:![0-9]+]] = distinct !{ptr @test3, !"grid_constant", [[META9]]} +; CHECK: [[META11:![0-9]+]] = !{ptr @test4, !"kernel", i32 1} +; CHECK: [[META12:![0-9]+]] = !{ptr @test4, !"grid_constant", [[META5]]} +; CHECK: [[META13:![0-9]+]] = distinct !{ptr @test4, !"grid_constant", [[META5]]} +; CHECK: [[META14:![0-9]+]] = !{ptr @test5, !"kernel", i32 1} +; CHECK: [[META15:![0-9]+]] = !{ptr @test5, !"grid_constant", [[META16:![0-9]+]], !"grid_constant", [[META16]], !"grid_constant", [[META16]]} +; CHECK: [[META16]] = !{i32 1, 
i32 4} +;. From f8dc5d39802bb0e75c90959d6e63e2374eca5da2 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 1 Jul 2024 12:33:04 +0100 Subject: [PATCH 3/6] Address feedback: simplify code; guard on SYCLIsDevice --- clang/lib/CodeGen/Targets/NVPTX.cpp | 79 ++--------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 921dca6adf72d..6288f3d7b12e1 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -238,80 +238,8 @@ static CudaArch getCudaArch(CodeGenModule &CGM) { } static bool supportsGridConstant(CudaArch Arch) { - switch (Arch) { - case CudaArch::SM_70: - case CudaArch::SM_72: - case CudaArch::SM_75: - case CudaArch::SM_80: - case CudaArch::SM_86: - case CudaArch::SM_87: - case CudaArch::SM_89: - case CudaArch::SM_90: - case CudaArch::SM_90a: - return true; - case CudaArch::UNKNOWN: - case CudaArch::UNUSED: - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32_: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: - return false; - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX940: - case CudaArch::GFX941: - case CudaArch::GFX942: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case 
CudaArch::GFX1013: - case CudaArch::GFX1030: - case CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::GFX1034: - case CudaArch::GFX1035: - case CudaArch::GFX1036: - case CudaArch::GFX1100: - case CudaArch::GFX1101: - case CudaArch::GFX1102: - case CudaArch::GFX1103: - case CudaArch::GFX1150: - case CudaArch::GFX1151: - case CudaArch::GFX1200: - case CudaArch::GFX1201: - case CudaArch::Generic: - case CudaArch::LAST: - llvm_unreachable("unhandled CudaArch"); - } + assert(IsNVIDIAGpuArch(Arch) && "Unexpected architecture"); + return Arch >= CudaArch::SM_70; } void NVPTXTargetCodeGenInfo::setTargetAttributes( @@ -345,7 +273,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); - if (supportsGridConstant(getCudaArch(M))) { + if (M.getLangOpts().SYCLIsDevice && + supportsGridConstant(getCudaArch(M))) { // Add grid_constant annotations to all relevant kernel-function // parameters. We can guarantee that in SYCL, all by-val kernel // parameters are "grid_constant". 
From eb07bc807a46d2b8e17d437af241c81379fdbede Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Mon, 1 Jul 2024 15:12:44 +0100
Subject: [PATCH 4/6] add fix for unknown

---
 clang/lib/CodeGen/Targets/NVPTX.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 6288f3d7b12e1..56c4efe4bcf1d 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -238,7 +238,9 @@ static CudaArch getCudaArch(CodeGenModule &CGM) {
 }
 
 static bool supportsGridConstant(CudaArch Arch) {
-  assert(IsNVIDIAGpuArch(Arch) && "Unexpected architecture");
+  assert((Arch == CudaArch::UNKNOWN || IsNVIDIAGpuArch(Arch)) &&
+         "Unexpected architecture");
+  static_assert(CudaArch::UNKNOWN < CudaArch::SM_70);
   return Arch >= CudaArch::SM_70;
 }
 
From 3e5e69352f66cfd968cd4c15064f0632ab1e645a Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Mon, 8 Jul 2024 10:27:02 +0100
Subject: [PATCH 5/6] address feedback; i->I and dyn_cast->cast

---
 llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index c47fbffe16619..b7bea8b17f2e6 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1261,7 +1261,7 @@ void DeadArgumentEliminationPass::UpdateNVPTXMetadata(
     const auto &FuncOperand = MetadataNode->getOperand(0);
     if (!FuncOperand)
       continue;
-    auto FuncConstant = dyn_cast<ConstantAsMetadata>(FuncOperand);
+    auto *FuncConstant = dyn_cast<ConstantAsMetadata>(FuncOperand);
     if (!FuncConstant)
       continue;
     auto *Func = dyn_cast<Function>(FuncConstant->getValue());
@@ -1272,8 +1272,8 @@ void DeadArgumentEliminationPass::UpdateNVPTXMetadata(
 
     // Carefully update any and all grid_constant annotations, since those are
     // denoted parameter indices, which may have changed
-    for (unsigned i = 1; i < MetadataNode->getNumOperands() - 1; i += 2) {
-      if (auto *Type = dyn_cast<MDString>(MetadataNode->getOperand(i));
+    for (unsigned I = 1; I < MetadataNode->getNumOperands() - 1; I += 2) {
+      if (auto *Type = dyn_cast<MDString>(MetadataNode->getOperand(I));
           Type && Type->getString() == "grid_constant") {
         LLVMContext &Ctx = NF->getContext();
         LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - updating nvvm "
@@ -1281,9 +1281,7 @@ void DeadArgumentEliminationPass::UpdateNVPTXMetadata(
                           << NF->getName() << "\n");
         // The 'value' operand is a list of integers denoting parameter indices
         auto *OldGridConstParamIdxs =
-            dyn_cast<MDNode>(MetadataNode->getOperand(i + 1));
-        if (!OldGridConstParamIdxs)
-          continue;
+            cast<MDNode>(MetadataNode->getOperand(I + 1));
         // For each parameter that's identified as a grid_constant, count how
         // many arguments before that position are dead, and shift the number
         // down by that amount.
@@ -1331,7 +1329,7 @@ void DeadArgumentEliminationPass::UpdateNVPTXMetadata(
                    dbgs() << "} to new annotation {";
                    PrintList(NewGridConstParamIdxs); dbgs() << "}\n";);
 
-        MetadataNode->replaceOperandWith(i + 1, NewGridConstParamIdxs);
+        MetadataNode->replaceOperandWith(I + 1, NewGridConstParamIdxs);
       }
     }
   }
From 93d5f3b3f3011a96f6a59a0c4a74269418ff33de Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Tue, 9 Jul 2024 16:59:22 +0100
Subject: [PATCH 6/6] update cast/assert method

---
 llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index b7bea8b17f2e6..2bbb68a885061 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1280,8 +1280,10 @@ void DeadArgumentEliminationPass::UpdateNVPTXMetadata(
                              "grid_constant annotations for fn: "
                           << NF->getName() << "\n");
         // The 'value' operand is a list of integers denoting parameter indices
-        auto *OldGridConstParamIdxs =
-            cast<MDNode>(MetadataNode->getOperand(I + 1));
+        const auto *OldGridConstParamIdxs =
+            dyn_cast<MDNode>(MetadataNode->getOperand(I + 1));
+        assert(OldGridConstParamIdxs &&
+               "Unexpected NVVM annotation format: expected MDNode operand");
         // For each parameter that's identified as a grid_constant, count how
         // many arguments before that position are dead, and shift the number
         // down by that amount.