diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index 54e8d8533bef8..b83a8a26eeae0 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -300,6 +300,18 @@ During the fusion process at runtime, the JIT will load the LLVM IR and finalize the fused kernel to the final target. More information is available [here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). +### Interaction with `parallel_for` range rounding + +DPCPP's [range rounding](./ParallelForRangeRounding.md) transformation is +transparent for fusion, meaning the generated wrapper kernel with the rounded up +range will be used. + +[Private internalization](#internalization-behavior) is supported when fusing +such kernels. We use the original, unrounded global size in dimension 0 when +computing the private memory size. As range rounding only applies to basic +kernels (parametrized by a `sycl::range`), local internalization is not affected +by the range rounding transformation. + ### Unsupported SYCL constructs The following SYCL API constructs are currently not officially supported for diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index a053c57db0c34..d7f3626c2bd62 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -203,10 +203,17 @@ static Promotion getInternalizationInfo(Requirement *Req) { return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion; } -static std::optional getLocalSize(NDRDescT NDRange, Requirement *Req, - Promotion Target) { +static std::optional getLocalSize(NDRDescT NDRange, + std::optional UserGlobalSize, + Requirement *Req, Promotion Target) { + assert((!UserGlobalSize.has_value() || Target != Promotion::Local) && + "Unexpected range rounding"); auto NumElementsMem = static_cast(Req->MSYCLMemObj)->size(); if (Target == Promotion::Private) { + if (UserGlobalSize.has_value()) { + // Only the first dimension is affected by range rounding. + NDRange.GlobalSize[0] = *UserGlobalSize; + } auto NumWorkItems = NDRange.GlobalSize.size(); // For private internalization, the local size is // (Number of elements in buffer)/(number of work-items) @@ -237,13 +244,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) { static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex, unsigned ArgFunctionIndex, NDRDescT NDRange, + std::optional UserGlobalSize, PromotionMap &Promotions) { assert(Arg.MType == kernel_param_kind_t::kind_accessor); Requirement *Req = static_cast(Arg.MPtr); auto ThisPromotionTarget = getInternalizationInfo(Req); - auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget); + auto ThisLocalSize = + getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget); if (Promotions.count(Req->MSYCLMemObj)) { // We previously encountered an accessor for the same buffer. @@ -278,7 +287,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex, // Recompute the local size for the previous definition with adapted // promotion target. auto NewPrevLocalSize = - getLocalSize(PreviousDefinition.NDRange, + getLocalSize(PreviousDefinition.NDRange, std::nullopt, PreviousDefinition.Definition, Promotion::Local); if (!NewPrevLocalSize.has_value()) { @@ -316,7 +325,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex, if (PreviousDefinition.PromotionTarget == Promotion::Local) { // Recompute the local size with adapted promotion target. - auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local); + auto ThisLocalSize = + getLocalSize(NDRange, std::nullopt, Req, Promotion::Local); if (!ThisLocalSize.has_value()) { printPerformanceWarning("Work-group size for local promotion not " "specified, not performing internalization"); @@ -591,11 +601,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, // argument is later on passed to the kernel. const size_t SizeAccField = sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims); - // Compute the local size and use it for the range parameters. - auto LocalSize = getLocalSize(NDRange, Req, - (PromotedToPrivate) ? Promotion::Private - : Promotion::Local); - range<3> AccessRange{1, 1, LocalSize.value()}; + // Compute the local size and use it for the range parameters (only + // relevant for local promotion). + size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt, + Req, Promotion::Local) + : 0; + range<3> AccessRange{1, 1, LocalSize}; auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange); // Use all-zero as the offset id<3> AcessOffset{0, 0, 0}; @@ -604,7 +615,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, // Override the arguments. // 1. Override the pointer with a std-layout argument with 'nullptr' as // value. handler.cpp does the same for local accessors. - int SizeInBytes = Req->MElemSize * LocalSize.value(); + int SizeInBytes = Req->MElemSize * LocalSize; FusedArgs[ArgIndex] = ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes, static_cast(ArgIndex)}; @@ -694,6 +705,26 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, return A.MIndex < B.MIndex; }); + // Determine whether the kernel has been subject to DPCPP's range rounding. + // If so, the first argument will be the original ("user") range. + std::optional UserGlobalSize; + if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 || + KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) && + !Args.empty()) { + auto &A0 = Args[0]; + auto Dims = KernelCG->MNDRDesc.Dims; + assert(A0.MPtr && A0.MSize == static_cast(Dims * sizeof(size_t)) && + A0.MType == kernel_param_kind_t::kind_std_layout && + "Unexpected signature for rounded range kernel"); + + size_t *UGS = reinterpret_cast(A0.MPtr); + // Range-rounding only applies to the first dimension. + assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]); + assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]); + assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]); + UserGlobalSize = UGS[0]; + } + ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()}; size_t ArgIndex = 0; // The kernel function in SPIR-V will only have the non-eliminated @@ -719,7 +750,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, if (!Eliminated) { if (Arg.MType == kernel_param_kind_t::kind_accessor) { resolveInternalization(Arg, KernelIndex, ArgFunctionIndex, - KernelCG->MNDRDesc, PromotedAccs); + KernelCG->MNDRDesc, UserGlobalSize, + PromotedAccs); } FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true); ++ArgFunctionIndex; diff --git a/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp b/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp index f912b2f0cc85d..52d07f99e7f8e 100644 --- a/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp +++ b/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp @@ -1,5 +1,6 @@ // RUN: %{build} -o %t.out -// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s +// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 \ +// RUN: %{run} %t.out 2>&1 | FileCheck %s // Test complete fusion of kernels with different ND-ranges. @@ -262,4 +263,12 @@ int main() { // 1-D, 2-D and 3-D kernels with different global sizes. test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}}, RangeDesc{{10, 1, 1}, {5, 1, 1}}}); + + // Test global sizes that trigger the rounded range kernel insertion. + // Note that we lower the RR threshold when running this test. + test({RangeDesc{67}, RangeDesc{87}, RangeDesc{64}}); + + // Test multi-dimensional range-rounded kernels. Only the first dimension will + // be rounded up. + test({RangeDesc{30, 67}, RangeDesc{76, 55}, RangeDesc{64, 64}}); } diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp index 5a1eb99ed0013..9bb6213a47ee1 100644 --- a/sycl/test-e2e/KernelFusion/private_internalization.cpp +++ b/sycl/test-e2e/KernelFusion/private_internalization.cpp @@ -1,5 +1,5 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out -// RUN: %{run} %t.out +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out // Test complete fusion with private internalization specified on the // accessors. @@ -8,8 +8,9 @@ using namespace sycl; -int main() { - constexpr size_t dataSize = 512; +template class KernelName; + +template static void test() { int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; for (size_t i = 0; i < dataSize; ++i) { @@ -39,7 +40,7 @@ int main() { auto accIn2 = bIn2.get_access(cgh); auto accTmp = bTmp.get_access( cgh, sycl::ext::codeplay::experimental::property::promote_private{}); - cgh.parallel_for( + cgh.parallel_for>( dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); }); @@ -48,7 +49,7 @@ int main() { cgh, sycl::ext::codeplay::experimental::property::promote_private{}); auto accIn3 = bIn3.get_access(cgh); auto accOut = bOut.get_access(cgh); - cgh.parallel_for( + cgh.parallel_for>( dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); }); @@ -63,6 +64,15 @@ int main() { assert(out[i] == (20 * i * i) && "Computation error"); assert(tmp[i] == -1 && "Not internalized"); } +} + +int main() { + // Test power-of-two size. + test<512>(); + + // Test prime size large enough to trigger rounded-range kernel insertion. + // Note that we lower the RR threshold when running this test. + test<523>(); return 0; } diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp index 7efe0f472032e..5894491c460cc 100644 --- a/sycl/test-e2e/KernelFusion/two_dimensional.cpp +++ b/sycl/test-e2e/KernelFusion/two_dimensional.cpp @@ -1,5 +1,5 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out -// RUN: %{run} %t.out +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out // Test complete fusion with private internalization specified on the // accessors for two-dimensional range. @@ -8,9 +8,9 @@ using namespace sycl; -int main() { - constexpr size_t sizeX = 16; - constexpr size_t sizeY = 32; +template class KernelName; + +template static void test() { constexpr size_t dataSize = sizeX * sizeY; int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; @@ -42,7 +42,7 @@ int main() { auto accIn2 = bIn2.get_access(cgh); auto accTmp = bTmp.get_access( cgh, sycl::ext::codeplay::experimental::property::promote_private{}); - cgh.parallel_for( + cgh.parallel_for>( xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); }); @@ -51,7 +51,7 @@ int main() { cgh, sycl::ext::codeplay::experimental::property::promote_private{}); auto accIn3 = bIn3.get_access(cgh); auto accOut = bOut.get_access(cgh); - cgh.parallel_for( + cgh.parallel_for>( xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; }); }); @@ -66,6 +66,15 @@ int main() { assert(out[i] == (20 * i * i) && "Computation error"); assert(tmp[i] == -1 && "Not internalized"); } +} + +int main() { + // Test power-of-two size. + test<16, 32>(); + + // Test prime sizes large enough to trigger rounded-range kernel insertion. + // Note that we lower the RR threshold when running this test. + test<67, 79>(); return 0; }