From ff116fc87f23ab5703413a1a9bf6c9943067187a Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Mon, 11 Nov 2019 20:10:04 +0300 Subject: [PATCH] [SYCL] Improve error reporting for kernel enqueue The idea is to handle error code returned from PI and based on input arguments, emit meaningful exception with error explanation. One of the side-effects of this is that this patch effectively allows to use non-uniform work-groups if underlying OpenCL supports this functionality. Signed-off-by: Alexey Sachkov --- sycl/include/CL/sycl/detail/cg.hpp | 13 +- sycl/include/CL/sycl/detail/pi.h | 7 +- sycl/source/CMakeLists.txt | 1 + .../detail/error_handling/enqueue_kernel.cpp | 183 ++++++ .../detail/error_handling/error_handling.hpp | 32 + sycl/source/detail/scheduler/commands.cpp | 12 +- sycl/test/basic_tests/parallel_for_range.cpp | 596 ++++++++++++++++-- .../basic_tests/parallel_for_range_host.cpp | 98 +++ 8 files changed, 869 insertions(+), 73 deletions(-) create mode 100644 sycl/source/detail/error_handling/enqueue_kernel.cpp create mode 100644 sycl/source/detail/error_handling/error_handling.hpp create mode 100644 sycl/test/basic_tests/parallel_for_range_host.cpp diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 8155af774f8c5..60c641a2d944c 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -237,7 +237,7 @@ class HostKernel : public HostKernelBase { for (int I = 0; I < Dims; ++I) { if (NDRDesc.LocalSize[I] == 0 || NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0) - throw sycl::runtime_error("Invalid local size for global size"); + throw sycl::nd_range_error("Invalid local size for global size"); GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } @@ -278,7 +278,7 @@ class HostKernel : public HostKernelBase { for (int I = 0; I < Dims; ++I) { if (NDRDesc.LocalSize[I] == 0 || NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0) - throw sycl::runtime_error("Invalid local size for global size"); + throw sycl::nd_range_error("Invalid local size for global size"); NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } @@ -385,15 +385,6 @@ class CGExecKernel : public CG { MStreams(std::move(Streams)) { assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) && "Wrong type of exec kernel CG."); - - if (MNDRDesc.LocalSize.size() > 0) { - range<3> Excess = (MNDRDesc.GlobalSize % MNDRDesc.LocalSize); - for (int I = 0; I < 3; I++) { - if (Excess[I] != 0) - throw nd_range_error("Global size is not a multiple of local size", - CL_INVALID_WORK_GROUP_SIZE); - } - } } std::vector getArguments() const { return MArgs; } diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 6358967a49a74..951cffe2688b4 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -62,7 +62,8 @@ typedef enum { PI_INVALID_DEVICE = CL_INVALID_DEVICE, PI_INVALID_BINARY = CL_INVALID_BINARY, PI_MISALIGNED_SUB_BUFFER_OFFSET = CL_MISALIGNED_SUB_BUFFER_OFFSET, - PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY + PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY, + PI_INVALID_WORK_GROUP_SIZE = CL_INVALID_WORK_GROUP_SIZE } _pi_result; typedef enum { @@ -88,7 +89,9 @@ typedef enum { PI_DEVICE_INFO_PARENT = CL_DEVICE_PARENT_DEVICE, PI_DEVICE_INFO_PLATFORM = CL_DEVICE_PLATFORM, PI_DEVICE_INFO_PARTITION_TYPE = CL_DEVICE_PARTITION_TYPE, - PI_DEVICE_INFO_NAME = CL_DEVICE_NAME + PI_DEVICE_INFO_NAME = CL_DEVICE_NAME, + PI_DEVICE_VERSION = CL_DEVICE_VERSION, + PI_DEVICE_MAX_WORK_GROUP_SIZE = CL_DEVICE_MAX_WORK_GROUP_SIZE } _pi_device_info; // TODO: populate diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dfd5bac3df8ad..d79f2d6797fa8 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -46,6 +46,7 @@ set(SYCL_SOURCES "detail/context_impl.cpp" "detail/device_impl.cpp" "detail/device_info.cpp" + "detail/error_handling/enqueue_kernel.cpp" "detail/event_impl.cpp" "detail/force_device.cpp" "detail/helpers.cpp" diff --git a/sycl/source/detail/error_handling/enqueue_kernel.cpp b/sycl/source/detail/error_handling/enqueue_kernel.cpp new file mode 100644 index 0000000000000..4c4e8c4b8217c --- /dev/null +++ b/sycl/source/detail/error_handling/enqueue_kernel.cpp @@ -0,0 +1,183 @@ +//===------------------- enqueue_kernel.cpp ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// SYCL error handling of enqueue kernel operations +// +//===----------------------------------------------------------------------===// + +#include "error_handling.hpp" + +#include + +namespace cl { +namespace sycl { +namespace detail { + +namespace enqueue_kernel_launch { + +bool handleInvalidWorkGroupSize(pi_device Device, pi_kernel Kernel, + const NDRDescT &NDRDesc) { + const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); + + size_t VerSize = 0; + PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, 0, nullptr, &VerSize); + assert(VerSize >= 10 && + "Unexpected device version string"); // strlen("OpenCL X.Y") + string_class VerStr(VerSize, '\0'); + PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, VerSize, &VerStr.front(), + nullptr); + const char *Ver = &VerStr[7]; // strlen("OpenCL ") + + size_t CompileWGSize[3] = {0}; + PI_CALL(piKernelGetGroupInfo)(Kernel, Device, + CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + sizeof(size_t) * 3, CompileWGSize, nullptr); + + if (CompileWGSize[0] != 0) { + // OpenCL 1.x && 2.0: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the + // reqd_work_group_size attribute is used to declare the work-group size + // for kernel in the program source. + if (!HasLocalSize && (Ver[0] == '1' || (Ver[0] == '2' && Ver[2] == '0'))) + throw sycl::nd_range_error( + "OpenCL 1.x and 2.0 requires to pass local size argument even if " + "required work-group size was specified in the program source", + PI_INVALID_WORK_GROUP_SIZE); + + // Any OpenCL version: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not + // match the required work-group size for kernel in the program source. + if (NDRDesc.LocalSize[0] != CompileWGSize[0] || + NDRDesc.LocalSize[1] != CompileWGSize[1] || + NDRDesc.LocalSize[2] != CompileWGSize[2]) + throw sycl::nd_range_error( + "Specified local size doesn't match the required work-group size " + "specified in the program source", + PI_INVALID_WORK_GROUP_SIZE); + } + + if (Ver[0] == '1') { + // OpenCL 1.x: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the + // total number of work-items in the work-group computed as + // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater + // than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in + // table 4.3 + size_t MaxWGSize = 0; + PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &MaxWGSize, nullptr); + const size_t TotalNumberOfWIs = + NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; + if (TotalNumberOfWIs > MaxWGSize) + throw sycl::nd_range_error( + "Total number of work-items in a work-group cannot exceed " + "info::device::max_work_group_size which is equal to " + + std::to_string(MaxWGSize), + PI_INVALID_WORK_GROUP_SIZE); + } else { + // OpenCL 2.x: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the + // total number of work-items in the work-group computed as + // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater + // than the value specified by CL_KERNEL_WORK_GROUP_SIZE in table 5.21. + size_t KernelWGSize = 0; + PI_CALL(piKernelGetGroupInfo)(Kernel, Device, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &KernelWGSize, nullptr); + const size_t TotalNumberOfWIs = + NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; + if (TotalNumberOfWIs > KernelWGSize) + throw sycl::nd_range_error( + "Total number of work-items in a work-group cannot exceed " + "info::kernel_work_group::work_group_size which is equal to " + + std::to_string(KernelWGSize) + " for this kernel", + PI_INVALID_WORK_GROUP_SIZE); + } + + if (HasLocalSize) { + const bool NonUniformWGs = + (NDRDesc.LocalSize[0] != 0 && + NDRDesc.GlobalSize[0] % NDRDesc.LocalSize[0] != 0) || + (NDRDesc.LocalSize[1] != 0 && + NDRDesc.GlobalSize[1] % NDRDesc.LocalSize[1] != 0) || + (NDRDesc.LocalSize[2] != 0 && + NDRDesc.GlobalSize[2] % NDRDesc.LocalSize[2] != 0); + + if (Ver[0] == '1') { + // OpenCL 1.x: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and + // number of workitems specified by global_work_size is not evenly + // divisible by size of work-group given by local_work_size + + if (NonUniformWGs) + throw sycl::nd_range_error( + "Non-uniform work-groups are not supported by the target device", + PI_INVALID_WORK_GROUP_SIZE); + } else { + // OpenCL 2.x: + // CL_INVALID_WORK_GROUP_SIZE if the program was compiled with + // –cl-uniform-work-group-size and the number of work-items specified + // by global_work_size is not evenly divisible by size of work-group + // given by local_work_size + + pi_program Program = nullptr; + PI_CALL(piKernelGetInfo)(Kernel, CL_KERNEL_PROGRAM, sizeof(pi_program), + &Program, nullptr); + size_t OptsSize = 0; + PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS, + 0, nullptr, &OptsSize); + string_class Opts(OptsSize, '\0'); + PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS, + OptsSize, &Opts.front(), nullptr); + if (NonUniformWGs) { + const bool HasStd20 = Opts.find("-cl-std=CL2.0") != string_class::npos; + if (!HasStd20) + throw sycl::nd_range_error( + "Non-uniform work-groups are not allowed by default. Underlying " + "OpenCL 2.x implementation supports this feature and to enable " + "it, build device program with -cl-std=CL2.0", + PI_INVALID_WORK_GROUP_SIZE); + else + throw sycl::nd_range_error( + "Non-uniform work-groups are not allowed by default. Underlying " + "OpenCL 2.x implementation supports this feature, but it is " + "disabled by -cl-uniform-work-group-size build flag", + PI_INVALID_WORK_GROUP_SIZE); + } + } + } + + // TODO: required number of sub-groups, OpenCL 2.1: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not + // consistent with the required number of sub-groups for kernel in the + // program source. + + // Fallback + constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE; + throw runtime_error( + "OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error); +} + +bool handleError(pi_result Error, pi_device Device, pi_kernel Kernel, + const NDRDescT &NDRDesc) { + assert(Error != PI_SUCCESS && + "Success is expected to be handled on caller side"); + switch (Error) { + case PI_INVALID_WORK_GROUP_SIZE: + return handleInvalidWorkGroupSize(Device, Kernel, NDRDesc); + // TODO: Handle other error codes + default: + throw runtime_error( + "OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error); + } +} + +} // namespace enqueue_kernel_launch + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/source/detail/error_handling/error_handling.hpp b/sycl/source/detail/error_handling/error_handling.hpp new file mode 100644 index 0000000000000..5bdf0cb90ffa6 --- /dev/null +++ b/sycl/source/detail/error_handling/error_handling.hpp @@ -0,0 +1,32 @@ +//===-------- error_handling.hpp - SYCL error handling ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +namespace enqueue_kernel_launch { +/// Analyzes error code and arguments of piEnqueueKernelLaunch to emit +/// user-friendly exception describing the problem. +/// +/// This function is expected to be called only for non-success error codes, +/// i.e. the first argument must not be equal to PI_SUCCESS. +/// +/// This function actually never returns and always throws an exception with +/// error description. +bool handleError(pi_result, pi_device, pi_kernel, const NDRDescT &); +} // namespace enqueue_kernel_launch + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 5caccb13bbd89..3ad9ee86cbc9c 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +#include + #include "CL/sycl/access/access.hpp" #include #include @@ -940,11 +942,19 @@ cl_int ExecCGCommand::enqueueImp() { ReverseRangeDimensionsForKernel(NDRDesc); - PI_CALL(piEnqueueKernelLaunch)( + pi_result Error = PI_CALL_NOCHECK(piEnqueueKernelLaunch)( MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + if (PI_SUCCESS != Error) { + // If we have got non-success error code, let's analyze it to emit nice + // exception explaining what was wrong + pi_device Device = + detail::getSyclObjImpl(MQueue->get_device())->getHandleRef(); + return detail::enqueue_kernel_launch::handleError(Error, Device, Kernel, + NDRDesc); + } return PI_SUCCESS; } case CG::CGTYPE::COPY_USM: { diff --git a/sycl/test/basic_tests/parallel_for_range.cpp b/sycl/test/basic_tests/parallel_for_range.cpp index 5d93d00d08b7e..1172b8bc44fd1 100644 --- a/sycl/test/basic_tests/parallel_for_range.cpp +++ b/sycl/test/basic_tests/parallel_for_range.cpp @@ -1,93 +1,571 @@ // RUN: %clangxx -fsycl %s -o %t.out -// RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out #include +#include + using namespace cl::sycl; +[[cl::reqd_work_group_size(4, 4, 4)]] void reqd_wg_size_helper() { + // do nothing +} + int main() { - auto asyncHandler = [](exception_list es) { - for (auto& e : es) { - std::rethrow_exception(e); + auto AsyncHandler = [](exception_list ES) { + for (auto &E : ES) { + std::rethrow_exception(E); } }; - // parallel_for, 100 global, 3 local -> fail. + queue Q(AsyncHandler); + device D(Q.get_device()); + + string_class DeviceVendorName = D.get_info(); + auto DeviceType = D.get_info(); + + // parallel_for, (16, 16, 16) global, (8, 8, 8) local, reqd_wg_size(4, 4, 4) + // -> fail try { - queue q(asyncHandler); - q.submit([&](handler &cgh) { - cgh.parallel_for(nd_range<1>(range<1>(100), range<1>(3)), - [=](nd_item<1> id) {}); + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<3>(range<3>(16, 16, 16), range<3>(8, 8, 8)), + [=](nd_item<3>) { reqd_wg_size_helper(); }); }); - q.wait_and_throw(); - assert(false && "Should have thrown exception"); - } catch (nd_range_error e) { - // We expect an error to be thrown! + Q.wait_and_throw(); + std::cerr << "Test case ReqdWGSizeNegativeA failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Specified local size doesn't match the required work-group size " + "specified in the program source") == string_class::npos) { + std::cerr + << "Test case ReqdWGSizeNegativeA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr << "Test case ReqdWGSizeNegativeA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case ReqdWGSizeNegativeA failed: something unexpected " + "has been caught" + << std::endl; + return 1; } - // parallel_for, 100 global, 4 local -> pass. - try { - queue q(asyncHandler); - q.submit([&](handler &cgh) { - cgh.parallel_for(nd_range<1>(range<1>(100), range<1>(4)), - [=](nd_item<1> id) {}); - }); - q.wait_and_throw(); - } catch (nd_range_error e) { - assert(false && "Should not have thrown exception"); + string_class OCLVersionStr = D.get_info(); + assert(OCLVersionStr.size() >= 10 && + "Unexpected device version string"); // strlen("OpenCL X.Y") + const char *OCLVersion = &OCLVersionStr[7]; // strlen("OpenCL ") + if (OCLVersion[0] == '1' || (OCLVersion[0] == '2' && OCLVersion[2] == '0')) { + // parallel_for, (16, 16, 16) global, null local, reqd_wg_size(4, 4, 4) // + // -> fail + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for( + range<3>(16, 16, 16), [=](item<3>) { reqd_wg_size_helper(); }); + }); + Q.wait_and_throw(); + std::cerr + << "Test case ReqdWGSizeNegativeB failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "OpenCL 1.x and 2.0 requires to pass local size argument even if " + "required work-group size was specified in the program source") == + string_class::npos) { + std::cerr + << "Test case ReqdWGSizeNegativeB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr + << "Test case ReqdWGSizeNegativeB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case ReqdWGSizeNegativeB failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } } - // parallel_for, (100, 33, 16) global, (2, 3, 4) local -> pass. + // Positive test-cases that should pass on any underlying OpenCL runtime + + // parallel_for, (8, 8, 8) global, (4, 4, 4) local, reqd_wg_size(4, 4, 4) -> + // pass try { - queue q(asyncHandler); - q.submit([&](handler &cgh) { - cgh.parallel_for(nd_range<3>(range<3>(100, 33, 16), - range<3>(2, 3, 4)), - [=](nd_item<3> id) {}); + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<3>(range<3>(8, 8, 8), range<3>(4, 4, 4)), + [=](nd_item<3>) { reqd_wg_size_helper(); }); }); - q.wait_and_throw(); - } catch (nd_range_error e) { - assert(false && "Should not have thrown exception"); + Q.wait_and_throw(); + } catch (nd_range_error &E) { + std::cerr << "Test case ReqdWGSizePositiveA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (runtime_error &E) { + std::cerr << "Test case ReqdWGSizePositiveA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case ReqdWGSizePositiveA failed: something unexpected " + "has been caught" + << std::endl; + return 1; } - // parallel_for, (100, 33, 16) global, (2, 3, 5) local -> fail. - try { - queue q(asyncHandler); - q.submit([&](handler &cgh) { - cgh.parallel_for(nd_range<3>(range<3>(100, 33, 16), - range<3>(2, 3, 5)), - [=](nd_item<3> id) {}); - }); - q.wait_and_throw(); - assert(false && "Should have thrown exception"); - } catch (nd_range_error e) { + if (OCLVersion[0] == '1') { + // OpenCL 1.x + + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and + // number of workitems specified by global_work_size is not evenly + // divisible by size of work-group given by local_work_size + try { + // parallel_for, 100 global, 3 local -> fail + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<1>(range<1>(100), range<1>(3)), [=](nd_item<1>) {}); + }); + Q.wait_and_throw(); + // FIXME: some Intel runtimes contain bug and don't return expected + // error code + if (info::device_type::accelerator != DeviceType || + DeviceVendorName.find("Intel") == string_class::npos) { + std::cerr + << "Test case OpenCL1XNegativeA failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } + } catch (nd_range_error &E) { + if (string_class(E.what()).find("Non-uniform work-groups are not " + "supported by the target device") == + string_class::npos) { + std::cerr + << "Test case OpenCL1XNegativeA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr << "Test case OpenCL1XNegativeA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL1XNegativeA failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + + try { + // parallel_for, (100, 33, 16) global, (2, 3, 5) local -> fail + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<3>(range<3>(100, 33, 16), range<3>(2, 3, 5)), + [=](nd_item<3>) {}); + }); + Q.wait_and_throw(); + // FIXME: some Intel runtimes contain bug and don't return expected + // error code + if (info::device_type::accelerator != DeviceType || + DeviceVendorName.find("Intel") == string_class::npos) { + std::cerr + << "Test case OpenCL1XNegativeB failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } + } catch (nd_range_error &E) { + if (string_class(E.what()).find("Non-uniform work-groups are not " + "supported by the target device") == + string_class::npos) { + std::cerr + << "Test case OpenCL1XNegativeB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr << "Test case OpenCL1XNegativeB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL1XNegativeB failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the + // total number of work-items in the work-group computed as + // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater + // than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in + // table 4.3 + size_t MaxDeviceWGSize = D.get_info(); + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<2>(range<2>(MaxDeviceWGSize, MaxDeviceWGSize), + range<2>(MaxDeviceWGSize, 2)), + [=](nd_item<2>) {}); + }); + Q.wait_and_throw(); + std::cerr << "Test case OpenCL1XNegativeC failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Total number of work-items in a work-group cannot exceed " + "info::device::max_work_group_size which is equal to " + + std::to_string(MaxDeviceWGSize)) == string_class::npos) { + std::cerr + << "Test case OpenCL1XNegativeC failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr << "Test case OpenCL1XNegativeC failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL1XNegativeC failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } else if (OCLVersion[0] == '2') { + // OpenCL 2.x + + // OpenCL 2.x: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the + // total number of work-items in the work-group computed as + // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater + // than the value specified by CL_KERNEL_WORK_GROUP_SIZE in table 5.21. + { + program P(Q.get_context()); + P.build_with_kernel_type(); + + kernel K = P.get_kernel(); + size_t MaxKernelWGSize = + K.get_work_group_info( + Q.get_device()); + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for( + K, + nd_range<2>(range<2>(MaxKernelWGSize, MaxKernelWGSize), + range<2>(MaxKernelWGSize, 2)), + [=](nd_item<2>) {}); + }); + Q.wait_and_throw(); + std::cerr + << "Test case OpenCL2XNegativeA failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Total number of work-items in a work-group cannot exceed " + "info::kernel_work_group::work_group_size which is equal to " + + std::to_string(MaxKernelWGSize) + " for this kernel") == + string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XNegativeA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XNegativeA failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } + + // By default, program is built in OpenCL 1.2 mode, so the following error + // is expected even for OpenCL 2.x: + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and + // number of workitems specified by global_work_size is not evenly + // divisible by size of work-group given by local_work_size + { + try { + // parallel_for, 100 global, 3 local -> fail + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<1>(range<1>(100), range<1>(3)), [=](nd_item<1>) {}); + }); + Q.wait_and_throw(); + // FIXME: some Intel runtimes contain bug and don't return expected + // error code + if (info::device_type::cpu != DeviceType || + DeviceVendorName.find("Intel") == string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeB failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Non-uniform work-groups are not allowed by default. " + "Underlying OpenCL 2.x implementation supports this feature " + "and to enable it, build device program with -cl-std=CL2.0") == + string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XNegativeB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XNegativeB failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + + try { + // parallel_for, (100, 33, 16) global, (2, 3, 5) local -> fail + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<3>(range<3>(100, 33, 16), range<3>(2, 3, 5)), + [=](nd_item<3>) {}); + }); + Q.wait_and_throw(); + // FIXME: some Intel runtimes contain bug and don't return expected + // error code + if (info::device_type::cpu != DeviceType || + DeviceVendorName.find("Intel") == string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeC failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Non-uniform work-groups are not allowed by default. " + "Underlying OpenCL 2.x implementation supports this feature " + "and to enable it, build device program with -cl-std=CL2.0") == + string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeC failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XNegativeC failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XNegativeC failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } + + // Enable non-uniform work-groups by passing -cl-std=CL2.0 + { + program P(Q.get_context()); + P.build_with_kernel_type("-cl-std=CL2.0"); + + kernel K = P.get_kernel(); + try { + // parallel_for, 100 global, 3 local -> pass + Q.submit([&](handler &CGH) { + CGH.parallel_for( + K, nd_range<1>(range<1>(100), range<1>(3)), [=](nd_item<1>) {}); + }); + Q.wait_and_throw(); + } catch (nd_range_error &E) { + std::cerr + << "Test case OpenCL2XPositiveA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XPositiveA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XPositiveA failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } + + { + program P(Q.get_context()); + P.build_with_kernel_type("-cl-std=CL2.0"); + + kernel K = P.get_kernel(); + try { + // parallel_for, (100, 33, 16) global, (2, 3, 5) local -> pass + Q.submit([&](handler &CGH) { + CGH.parallel_for( + K, nd_range<3>(range<3>(100, 33, 16), range<3>(2, 3, 5)), + [=](nd_item<3>) {}); + }); + Q.wait_and_throw(); + } catch (nd_range_error &E) { + std::cerr + << "Test case OpenCL2XPositiveB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XPositiveB failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XPositiveB failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } + + // Enable 2.0 mode with non-uniform work-groups, but disable the latter by + // specifying -cl-uniform-work-group-size: + // CL_INVALID_WORK_GROUP_SIZE if the program was compiled with + // –cl-uniform-work-group-size and the number of work-items specified + // by global_work_size is not evenly divisible by size of work-group + // given by local_work_size + { + program P(Q.get_context()); + P.build_with_kernel_type( + "-cl-std=CL2.0 -cl-uniform-work-group-size"); + + kernel K = P.get_kernel(); + try { + // parallel_for, 100 global, 3 local -> fail + Q.submit([&](handler &CGH) { + CGH.parallel_for( + K, nd_range<1>(range<1>(100), range<1>(3)), [=](nd_item<1>) {}); + }); + Q.wait_and_throw(); + // FIXME: some Intel runtimes contain bug and don't return expected + // error code + if (info::device_type::cpu != DeviceType || + DeviceVendorName.find("Intel") == string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeD failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Non-uniform work-groups are not allowed by default. " + "Underlying OpenCL 2.x implementation supports this feature, " + "but it is disabled by -cl-uniform-work-group-size build " + "flag") == string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeD failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XNegativeD failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XNegativeD failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } + + { + program P(Q.get_context()); + P.build_with_kernel_type( + "-cl-std=CL2.0 -cl-uniform-work-group-size"); + + kernel K = P.get_kernel(); + try { + // parallel_for, (100, 33, 16) global, (2, 3, 5) local -> fail + Q.submit([&](handler &CGH) { + CGH.parallel_for( + K, nd_range<3>(range<3>(100, 33, 16), range<3>(2, 3, 5)), + [=](nd_item<3>) {}); + }); + Q.wait_and_throw(); + // FIXME: some Intel runtimes contain bug and don't return expected + // error code + if (info::device_type::cpu != DeviceType || + DeviceVendorName.find("Intel") == string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeE failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Non-uniform work-groups are not allowed by default. " + "Underlying OpenCL 2.x implementation supports this feature, " + "but it is disabled by -cl-uniform-work-group-size build " + "flag") == string_class::npos) { + std::cerr + << "Test case OpenCL2XNegativeE failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr + << "Test case OpenCL2XNegativeE failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case OpenCL2XNegativeE failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + } } - // local size has a 0-based range -- no SIGFPEs, we hope. + // local size has a 0-based range -- no SIGFPEs, we hope try { - queue q(asyncHandler); - q.submit([&](handler &cgh) { - cgh.parallel_for(nd_range<2>(range<2>(5, 33), range<2>(1, 0)), - [=](nd_item<2> id) {}); + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<2>(range<2>(5, 33), range<2>(1, 0)), [=](nd_item<2>) {}); }); - q.wait_and_throw(); - assert(false && "Should have thrown exception"); - } catch (runtime_error e) { + Q.wait_and_throw(); + std::cerr << "Test case NegativeA failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (runtime_error) { } - // parallel_for_work_group with 0-based local range. + // parallel_for_work_group with 0-based local range try { - queue q(asyncHandler); - q.submit([&](handler &cgh) { - cgh.parallel_for_work_group(range<2>(5, 33), range<2>(1, 0), - [=](group<2> g) {}); + Q.submit([&](handler &CGH) { + CGH.parallel_for_work_group( + range<2>(5, 33), range<2>(1, 0), [=](group<2>) {}); }); - q.wait_and_throw(); - assert(false && "Should have thrown exception"); - } catch (runtime_error e) { + Q.wait_and_throw(); + std::cerr << "Test case NegativeB failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (runtime_error) { } + return 0; } diff --git a/sycl/test/basic_tests/parallel_for_range_host.cpp b/sycl/test/basic_tests/parallel_for_range_host.cpp new file mode 100644 index 0000000000000..012e0479b4b71 --- /dev/null +++ b/sycl/test/basic_tests/parallel_for_range_host.cpp @@ -0,0 +1,98 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out + +#include + +#include + +using namespace cl::sycl; + +int main() { + auto AsyncHandler = [](exception_list ES) { + for (auto& E : ES) { + std::rethrow_exception(E); + } + }; + + queue Q(AsyncHandler); + + // parallel_for, 100 global, 3 local -> fail. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for(nd_range<1>(range<1>(100), range<1>(3)), + [=](nd_item<1>) {}); + }); + Q.wait_and_throw(); + std::cerr << "Test case 'a' failed: no exception has been thrown" + << std::endl; + return 1; + } catch (nd_range_error) { + // We expect an error to be thrown! + } + + // parallel_for, 100 global, 4 local -> pass. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for(nd_range<1>(range<1>(100), range<1>(4)), + [=](nd_item<1>) {}); + }); + Q.wait_and_throw(); + } catch (nd_range_error) { + std::cerr << "Test case 'b' failed: exception has been thrown" << std::endl; + return 1; + } + + // parallel_for, (100, 33, 16) global, (2, 3, 4) local -> pass. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for(nd_range<3>(range<3>(100, 33, 16), + range<3>(2, 3, 4)), + [=](nd_item<3>) {}); + }); + Q.wait_and_throw(); + } catch (nd_range_error) { + std::cerr << "Test case 'c' failed: exception has been thrown" << std::endl; + return 1; + } + + // parallel_for, (100, 33, 16) global, (2, 3, 5) local -> fail. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for(nd_range<3>(range<3>(100, 33, 16), + range<3>(2, 3, 5)), + [=](nd_item<3>) {}); + }); + Q.wait_and_throw(); + std::cerr << "Test case 'd' failed: no exception has been thrown" + << std::endl; + return 1; + } catch (nd_range_error) { + } + + // local size has a 0-based range -- no SIGFPEs, we hope. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for(nd_range<2>(range<2>(5, 33), range<2>(1, 0)), + [=](nd_item<2>) {}); + }); + Q.wait_and_throw(); + std::cerr << "Test case 'e' failed: no exception has been thrown" + << std::endl; + return 1; + } catch (nd_range_error) { + } + + // parallel_for_work_group with 0-based local range. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for_work_group(range<2>(5, 33), range<2>(1, 0), + [=](group<2>) {}); + }); + Q.wait_and_throw(); + std::cerr << "Test case 'f' failed: no exception has been thrown" + << std::endl; + return 1; + } catch (nd_range_error) { + } + return 0; +}