From 67f336b86306ff8d99629ceb5e52414f8a7815fc Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 9 Jul 2024 17:43:39 +0100 Subject: [PATCH 1/2] [SYCL][NVPTX] Emit reqd_work_group_size attributes as NVVM annotations Only emit the provided values as annotations in the LLVM IR. The NVPTX backend will pad missing values with 1s. This suits the fact that the attribute must provide as many values as the dimensionality of the work-group, and we can assume that the work-group size of unused dimensions is 1. --- clang/lib/CodeGen/Targets/NVPTX.cpp | 23 ++++++++ .../test/CodeGenSYCL/reqd-work-group-size.cpp | 53 ++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index d8964749dc483..75b54b749b38c 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -261,6 +261,29 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal()); } + if (const auto *MWGS = FD->getAttr()) { + llvm::SmallVector, 3> Ops; + // Index-flip and pad out any missing elements. Note the misleading + // nomenclature of the methods: getXDimVal doesn't return the X dimension; + // it returns the left-most dimension (dim0). This could correspond to + // CUDA's X, Y, or Z, depending on the number of operands provided. 
+ if (auto Dim0 = MWGS->getXDimVal()) + Ops.push_back(Dim0->getExtValue()); + if (auto Dim1 = MWGS->getYDimVal()) + Ops.push_back(Dim1->getExtValue()); + if (auto Dim2 = MWGS->getZDimVal()) + Ops.push_back(Dim2->getExtValue()); + std::reverse(Ops.begin(), Ops.end()); + Ops.append(3 - Ops.size(), std::nullopt); + + if (auto X = Ops[0]) + addNVVMMetadata(F, "reqntidx", *X); + if (auto Y = Ops[1]) + addNVVMMetadata(F, "reqntidy", *Y); + if (auto Z = Ops[2]) + addNVVMMetadata(F, "reqntidz", *Z); + } + auto attrValue = [&](Expr *E) { const auto *CE = cast(E); std::optional Val = CE->getResultAsAPSInt(); diff --git a/clang/test/CodeGenSYCL/reqd-work-group-size.cpp b/clang/test/CodeGenSYCL/reqd-work-group-size.cpp index 542655a94ac3a..ddb28c3fac1d5 100644 --- a/clang/test/CodeGenSYCL/reqd-work-group-size.cpp +++ b/clang/test/CodeGenSYCL/reqd-work-group-size.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX #include "sycl.hpp" @@ -123,6 +123,55 @@ int main() { // CHECK: define {{.*}} void @{{.*}}kernel_name22() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] 
!reqd_work_group_size ![[WGSIZE1D22:[0-9]+]] // CHECK: define {{.*}} void @{{.*}}kernel_name24() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D2:[0-9]+]] +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidx", i32 16} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidy", i32 16} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidz", i32 32} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidx", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidy", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidz", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidx", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidy", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidz", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidx", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidy", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidz", i32 1} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidx", i32 16} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidy", i32 16} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidz", i32 32} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidx", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidy", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidz", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidx", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidy", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidz", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidx", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidy", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidz", i32 1} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidx", i32 16} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidy", i32 32} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name13, !"reqntidz" +// 
CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidx", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidy", i32 8} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name15, !"reqntidz" +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidx", i32 2} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidy", i32 2} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name16, !"reqntidz" +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidx", i32 8} +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidy", i32 1} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name18, !"reqntidz" +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name19, !"reqntidx", i32 32} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidy", +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidz", +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name21, !"reqntidx", i32 8} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidy", +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidz", +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name22, !"reqntidx", i32 2} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidy", +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidz", +// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name24, !"reqntidx", i32 1} +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidy", +// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidz", + // CHECK: ![[NDRWGS3D]] = !{i32 3} // CHECK: ![[WGSIZE3D32]] = !{i32 16, i32 16, i32 32} // CHECK: ![[WGSIZE3D88]] = !{i32 8, i32 8, i32 8} From 22ff4c866d07d08098e13e4fac2cb89a59f06356 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 10 Jul 2024 12:33:02 +0100 Subject: [PATCH 2/2] fix excessively large work-group sizes --- clang/lib/CodeGen/Targets/NVPTX.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 75b54b749b38c..0321978c26620 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ 
b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -261,27 +261,35 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal()); } - if (const auto *MWGS = FD->getAttr()) { + if (const auto *RWGS = FD->getAttr()) { llvm::SmallVector, 3> Ops; // Index-flip and pad out any missing elements. Note the misleading // nomenclature of the methods: getXDimVal doesn't return the X dimension; // it returns the left-most dimension (dim0). This could correspond to // CUDA's X, Y, or Z, depending on the number of operands provided. - if (auto Dim0 = MWGS->getXDimVal()) + if (auto Dim0 = RWGS->getXDimVal()) Ops.push_back(Dim0->getExtValue()); - if (auto Dim1 = MWGS->getYDimVal()) + if (auto Dim1 = RWGS->getYDimVal()) Ops.push_back(Dim1->getExtValue()); - if (auto Dim2 = MWGS->getZDimVal()) + if (auto Dim2 = RWGS->getZDimVal()) Ops.push_back(Dim2->getExtValue()); std::reverse(Ops.begin(), Ops.end()); Ops.append(3 - Ops.size(), std::nullopt); - if (auto X = Ops[0]) - addNVVMMetadata(F, "reqntidx", *X); - if (auto Y = Ops[1]) - addNVVMMetadata(F, "reqntidy", *Y); - if (auto Z = Ops[2]) - addNVVMMetadata(F, "reqntidz", *Z); + // Work-group sizes (in NVVM annotations) must be positive and no greater + // than INT32_MAX, whereas SYCL can allow for larger work-group sizes (see + // -fno-sycl-id-queries-fit-in-int). If any dimension is too large for + // NVPTX, don't emit any annotation at all. + if (llvm::all_of(Ops, [](std::optional V) { + return !V || llvm::isUInt<31>(*V); + })) { + if (auto X = Ops[0]) + addNVVMMetadata(F, "reqntidx", *X); + if (auto Y = Ops[1]) + addNVVMMetadata(F, "reqntidy", *Y); + if (auto Z = Ops[2]) + addNVVMMetadata(F, "reqntidz", *Z); + } } auto attrValue = [&](Expr *E) {