diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort.cpp new file mode 100644 index 0000000000..1b09d96180 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/sort.cpp @@ -0,0 +1,374 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t13.out + +#include "support.h" +#include + +#include +#include +#include +#include + +namespace my_sycl = sycl::ext::oneapi; + +auto async_handler_ = [](sycl::exception_list ex_list) { + for (auto &ex : ex_list) { + try { + std::rethrow_exception(ex); + } catch (sycl::exception &ex) { + std::cerr << ex.what() << std::endl; + std::exit(EXIT_FAILURE); + } + } +}; + +constexpr uint32_t items_per_work_item = 4; + +struct CustomType { + int x; +}; + +struct CustomFunctor { + bool operator()(const CustomType &lhs, const CustomType &rhs) const { + return lhs.x < rhs.x; + } +}; + +// we need it since using std::abs leads to compilation error +template T my_abs(T x) { return x >= 0 ? x : -x; } + +template bool check(T lhs, T rhs, float epsilon) { + return my_abs(lhs - rhs) > epsilon; +} +bool check(CustomType lhs, CustomType rhs, float epsilon) { + return my_abs(lhs.x - rhs.x) > epsilon; +} + +template +bool verify(T *expected, T *got, std::size_t n, float epsilon) { + for (std::size_t i = 0; i < n; ++i) { + if (check(expected[i], got[i], epsilon)) { + return false; + } + } + return true; +} + +// forward declared classes to name kernels +template class sort_over_group_kernel_name; +template class joint_sort_kernel_name; +template class custom_sorter_kernel_name; + +// this class is needed to pass dimension value to aforementioned classes +template class int_wrapper; + +// custom sorter +template struct bubble_sorter { + Compare comp; + size_t idx; + + template + void operator()(Group g, Ptr begin, Ptr end) { + size_t n = end - begin; + if (idx == 0) + for (size_t i = 0; i < n; ++i) + for (size_t j = i + 1; j < n; ++j) + if (comp(begin[j], begin[i])) + std::swap(begin[i], begin[j]); + } +}; + +template sycl::range get_range(const std::size_t local); + +template <> sycl::range<1> get_range<1>(const std::size_t local) { + return sycl::range<1>(local); +} + +template <> sycl::range<2> get_range<2>(const std::size_t local) { + return sycl::range<2>(local, 1); +} + +template <> sycl::range<3> get_range<3>(const std::size_t local) { + return sycl::range<3>(local, 1, 1); +} + +template +int test_sort_over_group(sycl::queue &q, std::size_t local, + sycl::buffer &bufI1, Compare comp, int test_case) { + auto n = bufI1.size(); + if (n > local) + return -1; + + sycl::range local_range = get_range(local); + + std::size_t local_memory_size = + my_sycl::experimental::default_sorter<>::memory_required( + sycl::memory_scope::work_group, local_range); + + if (local_memory_size > + q.get_device().template get_info()) + std::cout << "local_memory_size = " << local_memory_size << ", available = " + << q.get_device() + .template get_info() + << std::endl; + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + sycl::accessor + scratch({local_memory_size}, h); + + h.parallel_for, T, Compare>>( + sycl::nd_range(local_range, local_range), + [=](sycl::nd_item id) { + scratch[0] = std::byte{}; + auto local_id = id.get_local_linear_id(); + switch (test_case) { + case 0: + if constexpr (std::is_same_v> && + !std::is_same_v) + aI1[local_id] = my_sycl::sort_over_group( + my_sycl::experimental::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + aI1[local_id]); + break; + case 1: + aI1[local_id] = my_sycl::sort_over_group( + my_sycl::experimental::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + aI1[local_id], comp); + break; + case 2: + aI1[local_id] = my_sycl::sort_over_group( + id.get_group(), aI1[local_id], + my_sycl::experimental::default_sorter( + sycl::span{&scratch[0], local_memory_size})); + break; + } + }); + }).wait_and_throw(); + return 1; +} + +template +int test_joint_sort(sycl::queue &q, std::size_t n_items, std::size_t local, + sycl::buffer &bufI1, Compare comp, int test_case) { + auto n = bufI1.size(); + auto n_groups = (n - 1) / n_items + 1; + + std::size_t local_memory_size = + my_sycl::experimental::default_sorter<>::memory_required( + sycl::memory_scope::work_group, n); + if (local_memory_size > + q.get_device().template get_info()) + std::cout << "local_memory_size = " << local_memory_size << ", available = " + << q.get_device() + .template get_info() + << std::endl; + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + sycl::accessor + scratch({local_memory_size}, h); + + h.parallel_for>( + sycl::nd_range<1>{{n_groups * local}, {local}}, + [=](sycl::nd_item<1> id) { + auto group_id = id.get_group(0); + auto ptr_keys = &aI1[group_id * n_items]; + // Replacing the line above with the line below also works + // auto ptr_keys = aI1.get_pointer() + group_id * n_items; + + scratch[0] = std::byte{}; + switch (test_case) { + case 0: + if constexpr (std::is_same_v> && + !std::is_same_v) + my_sycl::joint_sort( + my_sycl::experimental::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items)); + break; + case 1: + my_sycl::joint_sort( + my_sycl::experimental::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items), comp); + break; + case 2: + my_sycl::joint_sort( + id.get_group(), ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items), + my_sycl::experimental::default_sorter( + sycl::span{&scratch[0], local_memory_size})); + break; + } + }); + }).wait_and_throw(); + return n_groups; +} + +template +int test_custom_sorter(sycl::queue &q, sycl::buffer &bufI1, Compare comp) { + std::size_t local = 256; + auto n = bufI1.size(); + if (n > local) + return -1; + local = std::min(local, n); + + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + + h.parallel_for>( + sycl::nd_range<2>({local, 1}, {local, 1}), [=](sycl::nd_item<2> id) { + auto ptr = aI1.get_pointer(); + + my_sycl::joint_sort( + id.get_group(), ptr, ptr + n, + bubble_sorter{comp, id.get_local_linear_id()}); + }); + }).wait_and_throw(); + return 1; +} + +template +void run_sort(sycl::queue &q, std::vector &in, std::size_t size, + Compare comp, int test_case, int sort_case) { + std::vector in2(in.begin(), in.begin() + size); + std::vector expected(in.begin(), in.begin() + size); + std::size_t local = + q.get_device() + .template get_info(); + local = std::min(local, size); + auto n_items = items_per_work_item * local; + + int n_groups = 1; + { // scope to destruct buffers + sycl::buffer bufKeys(in2.data(), size); + { + switch (sort_case) { + case 0: + // this case is just to check the compilation + n_groups = test_sort_over_group<1>(q, local, bufKeys, comp, test_case); + + n_groups = test_sort_over_group<2>(q, local, bufKeys, comp, test_case); + break; + case 1: + n_groups = test_joint_sort(q, n_items, local, bufKeys, comp, test_case); + break; + case 2: + n_groups = test_custom_sorter(q, bufKeys, comp); + break; + } + } + } + + // check results + for (int i_group = 0; i_group < n_groups; ++i_group) { + std::sort(expected.begin() + i_group * n_items, + expected.begin() + std::min((i_group + 1) * n_items, size), comp); + } + if (n_groups != -1 && + (test_case != 0 || + test_case == 0 && std::is_same_v> && + !std::is_same_v)&&!verify(expected.data(), in2.data(), + size, 0.001f)) { + std::cerr << "Verification failed \n"; + exit(1); + } +} + +template struct test_sort_cases { + template + void operator()(sycl::queue &q, std::size_t dataSize, Compare comp, + Generator generate) { + std::vector stationaryData(dataSize); + // fill data + for (std::size_t i = 0; i < dataSize; ++i) + stationaryData[i] = generate(i); + + // run test + for (int test_case = 0; test_case < 3; ++test_case) { + for (int sort_case = 0; sort_case < 3; ++sort_case) { + run_sort(q, stationaryData, dataSize, comp, test_case, sort_case); + } + } + } +}; + +void test_custom_type(sycl::queue &q, std::size_t dataSize) { + std::vector stationaryData(dataSize, CustomType{0}); + // fill data + for (std::size_t i = 0; i < dataSize; ++i) + stationaryData[i] = CustomType{int(i)}; + + // run test + for (int test_case = 0; test_case < 1; ++test_case) { + for (int sort_case = 0; sort_case < 3; ++sort_case) { + run_sort(q, stationaryData, dataSize, CustomFunctor{}, test_case, + sort_case); + } + } +} + +template +void test_sort_by_comp(sycl::queue &q, std::size_t dataSize) { + std::default_random_engine generator; + std::normal_distribution distribution((10.0), (2.0)); + + T max_size = std::numeric_limits::max(); + std::size_t to_fill = dataSize; + if (dataSize > max_size) + to_fill = max_size; + + // reversed order + test_sort_cases()(q, to_fill, Compare{}, + [to_fill](std::size_t i) { return T(to_fill - i - 1); }); + // filled by 1 + test_sort_cases()(q, dataSize, Compare{}, + [](std::size_t) { return T(1); }); + // random distribution + test_sort_cases()(q, dataSize, Compare{}, + [&distribution, &generator](std::size_t) { + return T(distribution(generator)); + }); +} + +template +void test_sort_by_type(sycl::queue &q, std::size_t dataSize) { + test_sort_by_comp>(q, dataSize); + test_sort_by_comp>(q, dataSize); +} + +int main(int argc, char *argv[]) { + sycl::queue q(sycl::default_selector{}, async_handler_); + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + std::vector sizes{1, 2, 64, 256, 1024, 2048, 4096}; + + for (int i = 0; i < sizes.size(); ++i) { + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + + test_custom_type(q, sizes[i]); + } + std::cout << "Test passed." << std::endl; +}