From b7a6883a8d483712a2f2ab33e39052ffe9f497ae Mon Sep 17 00:00:00 2001 From: Renaud-K Date: Tue, 17 Dec 2024 17:43:50 -0800 Subject: [PATCH 1/2] Using nvvm intrinsics for the syncthread and threadfence families of calls --- .../flang/Optimizer/Builder/IntrinsicCall.h | 7 ++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 85 +++++++++++++++++++ flang/module/cudadevice.f90 | 14 +-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 8 +- 4 files changed, 103 insertions(+), 11 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index bc0020e614db2..c16c717779a61 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -392,6 +392,10 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef); void genSignalSubroutine(llvm::ArrayRef); void genSleep(llvm::ArrayRef); + void genSyncThreads(llvm::ArrayRef); + mlir::Value genSyncThreadsAnd(mlir::Type,llvm::ArrayRef); + mlir::Value genSyncThreadsCount(mlir::Type,llvm::ArrayRef); + mlir::Value genSyncThreadsOr(mlir::Type,llvm::ArrayRef); fir::ExtendedValue genSystem(std::optional, mlir::ArrayRef args); void genSystemClock(llvm::ArrayRef); @@ -401,6 +405,9 @@ struct IntrinsicLibrary { llvm::ArrayRef); fir::ExtendedValue genTranspose(mlir::Type, llvm::ArrayRef); + void genThreadFence(llvm::ArrayRef); + void genThreadFenceBlock(llvm::ArrayRef); + void genThreadFenceSystem(llvm::ArrayRef); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 547cebefd2df4..fe449d95c1605 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -642,6 +642,10 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, + {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, + {"syncthreads_and", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, + {"syncthreads_count", &I::genSyncThreadsCount, {}, /*isElemental=*/false}, + {"syncthreads_or", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -660,6 +664,9 @@ static constexpr IntrinsicHandler handlers[]{ &I::genTranspose, {{{"matrix", asAddr}}}, /*isElemental=*/false}, + {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, + {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, + {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"trim", &I::genTrim, {{{"string", asAddr}}}, /*isElemental=*/false}, {"ubound", &I::genUbound, @@ -7290,6 +7297,52 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } +// SYNCTHREADS +void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector noArgs; + builder.create(loc, funcOp, noArgs); +} + +// SYNCTHREADS_AND +mlir::Value +IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + return builder.create(loc, funcOp, args).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + return builder.create(loc, funcOp, args).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + return builder.create(loc, funcOp, args).getResult(0); +} + // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional resultType, @@ -7420,6 +7473,38 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } +// THREADFENCE +void IntrinsicLibrary::genThreadFence(llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector noArgs; + builder.create(loc, funcOp, noArgs); +} + +// THREADFENCE_BLOCK +void IntrinsicLibrary::genThreadFenceBlock( + llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector noArgs; + builder.create(loc, funcOp, noArgs); +} + +// THREADFENCE_SYSTEM +void IntrinsicLibrary::genThreadFenceSystem( + llvm::ArrayRef args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector noArgs; + builder.create(loc, funcOp, noArgs); +} + // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 0224ecfdde7c6..1402bd4e15041 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -18,27 +18,27 @@ module cudadevice ! Synchronization Functions interface - attributes(device) subroutine syncthreads() bind(c, name='__syncthreads') + attributes(device) subroutine syncthreads() end subroutine end interface public :: syncthreads interface - attributes(device) integer function syncthreads_and(value) bind(c, name='__syncthreads_and') + attributes(device) integer function syncthreads_and(value) integer :: value end function end interface public :: syncthreads_and interface - attributes(device) integer function syncthreads_count(value) bind(c, name='__syncthreads_count') + attributes(device) integer function syncthreads_count(value) integer :: value end function end interface public :: syncthreads_count interface - attributes(device) integer function syncthreads_or(value) bind(c, name='__syncthreads_or') + attributes(device) integer function syncthreads_or(value) integer :: value end function end interface @@ -54,19 +54,19 @@ attributes(device) subroutine syncwarp(mask) bind(c, name='__syncwarp') ! Memory Fences interface - attributes(device) subroutine threadfence() bind(c, name='__threadfence') + attributes(device) subroutine threadfence() end subroutine end interface public :: threadfence interface - attributes(device) subroutine threadfence_block() bind(c, name='__threadfence_block') + attributes(device) subroutine threadfence_block() end subroutine end interface public :: threadfence_block interface - attributes(device) subroutine threadfence_system() bind(c, name='__threadfence_system') + attributes(device) subroutine threadfence_system() end subroutine end interface public :: threadfence_system diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 1331b644130c8..1951fb1d43c81 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -17,14 +17,14 @@ attributes(global) subroutine devsub() end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} -! CHECK: fir.call @__syncthreads() +! CHECK: fir.call @llvm.nvvm.barrier0() fastmath : () -> () ! CHECK: fir.call @__syncwarp(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> () ! CHECK: fir.call @__threadfence() ! CHECK: fir.call @__threadfence_block() ! CHECK: fir.call @__threadfence_system() -! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> i32 -! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> i32 -! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath : (i32) -> i32 +! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath : (i32) -> i32 +! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath : (i32) -> i32 ! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads", fir.proc_attrs = #fir.proc_attrs} ! CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs} From 61cea52e13ad730edd0098965574b865084690c5 Mon Sep 17 00:00:00 2001 From: Renaud-K Date: Wed, 18 Dec 2024 09:03:40 -0800 Subject: [PATCH 2/2] Ordering function names. Applying formatting. Updating test --- .../flang/Optimizer/Builder/IntrinsicCall.h | 6 +++--- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 6 +++--- flang/test/Lower/CUDA/cuda-device-proc.cuf | 20 +++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index c16c717779a61..6899505eeb39d 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -393,9 +393,9 @@ struct IntrinsicLibrary { void genSignalSubroutine(llvm::ArrayRef); void genSleep(llvm::ArrayRef); void genSyncThreads(llvm::ArrayRef); - mlir::Value genSyncThreadsAnd(mlir::Type,llvm::ArrayRef); - mlir::Value genSyncThreadsCount(mlir::Type,llvm::ArrayRef); - mlir::Value genSyncThreadsOr(mlir::Type,llvm::ArrayRef); + mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genSystem(std::optional, mlir::ArrayRef args); void genSystemClock(llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index fe449d95c1605..542ced62d39fe 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -655,6 +655,9 @@ static constexpr IntrinsicHandler handlers[]{ {{{"count", asAddr}, {"count_rate", asAddr}, {"count_max", asAddr}}}, /*isElemental=*/false}, {"tand", &I::genTand}, + {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, + {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, + {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -664,9 +667,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genTranspose, {{{"matrix", asAddr}}}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"trim", &I::genTrim, {{{"string", asAddr}}}, /*isElemental=*/false}, {"ubound", &I::genUbound, diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 1951fb1d43c81..2042bbbe19650 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -19,18 +19,18 @@ end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} ! CHECK: fir.call @llvm.nvvm.barrier0() fastmath : () -> () ! CHECK: fir.call @__syncwarp(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> () -! CHECK: fir.call @__threadfence() -! CHECK: fir.call @__threadfence_block() -! CHECK: fir.call @__threadfence_system() +! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath : () -> () +! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath : () -> () +! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath : () -> () ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath : (i32) -> i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath : (i32) -> i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath : (i32) -> i32 -! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads", fir.proc_attrs = #fir.proc_attrs} +! CHECK: func.func private @llvm.nvvm.barrier0() ! CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs} -! CHECK: func.func private @__threadfence() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence", fir.proc_attrs = #fir.proc_attrs} -! CHECK: func.func private @__threadfence_block() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_block", fir.proc_attrs = #fir.proc_attrs} -! CHECK: func.func private @__threadfence_system() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_system", fir.proc_attrs = #fir.proc_attrs} -! CHECK: func.func private @__syncthreads_and(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_and", fir.proc_attrs = #fir.proc_attrs} -! CHECK: func.func private @__syncthreads_count(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_count", fir.proc_attrs = #fir.proc_attrs} -! CHECK: func.func private @__syncthreads_or(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_or", fir.proc_attrs = #fir.proc_attrs} +! CHECK: func.func private @llvm.nvvm.membar.gl() +! CHECK: func.func private @llvm.nvvm.membar.cta() +! CHECK: func.func private @llvm.nvvm.membar.sys() +! CHECK: func.func private @llvm.nvvm.barrier0.and(i32) -> i32 +! CHECK: func.func private @llvm.nvvm.barrier0.popc(i32) -> i32 +! CHECK: func.func private @llvm.nvvm.barrier0.or(i32) -> i32