diff --git a/dpctl-capi/CMakeLists.txt b/dpctl-capi/CMakeLists.txt index 77891d6ee7..60156f721a 100644 --- a/dpctl-capi/CMakeLists.txt +++ b/dpctl-capi/CMakeLists.txt @@ -86,6 +86,7 @@ elseif(UNIX) "-Wuninitialized " "-Wmissing-declarations " "-fdiagnostics-color=auto " + "-Wno-deprecated-declarations " ) string(CONCAT SDL_FLAGS "-fstack-protector " diff --git a/dpctl-capi/dbg_build.sh b/dpctl-capi/dbg_build.sh index 06055fad84..958a4f8ee3 100755 --- a/dpctl-capi/dbg_build.sh +++ b/dpctl-capi/dbg_build.sh @@ -13,7 +13,7 @@ cmake \ -DCMAKE_CXX_COMPILER=dpcpp \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} \ - -DDPCTL_ENABLE_LO_PROGRAM_CREATION=${USE_LO_HEADERS} \ + -DDPCTL_ENABLE_LO_PROGRAM_CREATION=ON \ -DDPCTL_BUILD_CAPI_TESTS=ON \ -DDPCTL_GENERATE_COVERAGE=ON \ .. diff --git a/dpctl-capi/tests/CMakeLists.txt b/dpctl-capi/tests/CMakeLists.txt index 9ac78a2c19..1000634794 100644 --- a/dpctl-capi/tests/CMakeLists.txt +++ b/dpctl-capi/tests/CMakeLists.txt @@ -35,11 +35,12 @@ if(DPCTL_GENERATE_COVERAGE) list(REMOVE_ITEM dpctl_sources "${CMAKE_CURRENT_SOURCE_DIR}/../source/dpctl_vector_templ.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../source/dpcpp_kernels.cpp" ) # Add profiling flags set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping" + "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping -DDPCTL_COVERAGE" ) # Add all dpctl sources into a single executable so that we can run coverage @@ -89,6 +90,10 @@ if(DPCTL_GENERATE_COVERAGE) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) else() + add_library(dpcpp_kernels + STATIC + ${CMAKE_CURRENT_SOURCE_DIR}/dpcpp_kernels.cpp + ) file(GLOB_RECURSE sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) add_executable(dpctl_c_api_tests EXCLUDE_FROM_ALL ${sources}) target_link_libraries(dpctl_c_api_tests @@ -96,6 +101,7 @@ else() GTest::GTest DPCTLSyclInterface ${LEVEL_ZERO_LIBRARY} + dpcpp_kernels ) endif() diff --git a/dpctl-capi/tests/dpcpp_kernels.cpp b/dpctl-capi/tests/dpcpp_kernels.cpp new file mode 100644 index 0000000000..897aa86de4 --- /dev/null +++ b/dpctl-capi/tests/dpcpp_kernels.cpp @@ -0,0 +1,89 @@ +#include "dpcpp_kernels.hpp" +#include +#include + +template sycl::kernel +dpcpp_kernels::get_fill_kernel(sycl::queue &, size_t, int *, int); + +template sycl::kernel +dpcpp_kernels::get_fill_kernel(sycl::queue &, + size_t, + unsigned int *, + unsigned int); + +template sycl::kernel +dpcpp_kernels::get_fill_kernel(sycl::queue &, size_t, double *, double); + +template sycl::kernel +dpcpp_kernels::get_fill_kernel(sycl::queue &, size_t, float *, float); + +template sycl::kernel +dpcpp_kernels::get_range_kernel(sycl::queue &, size_t, int *); + +template sycl::kernel +dpcpp_kernels::get_range_kernel(sycl::queue &, + size_t, + unsigned int *); + +template sycl::kernel +dpcpp_kernels::get_range_kernel(sycl::queue &, size_t, float *); + +template sycl::kernel +dpcpp_kernels::get_range_kernel(sycl::queue &, size_t, double *); + +template sycl::kernel dpcpp_kernels::get_mad_kernel(sycl::queue &, + size_t, + int *, + int *, + int *, + int); + +template sycl::kernel +dpcpp_kernels::get_mad_kernel(sycl::queue &, + size_t, + unsigned int *, + unsigned int *, + unsigned int *, + unsigned int); + +template sycl::kernel dpcpp_kernels::get_local_sort_kernel(sycl::queue &, + size_t, + size_t, + int *, + size_t); + +template sycl::kernel +dpcpp_kernels::get_local_count_exceedance_kernel(sycl::queue &, + size_t, + size_t, + int *, + size_t, + int, + int *); + +template sycl::kernel +dpcpp_kernels::get_local_count_exceedance_kernel(sycl::queue &, + size_t, + size_t, + unsigned int *, + size_t, + unsigned int, + int *); + +template sycl::kernel +dpcpp_kernels::get_local_count_exceedance_kernel(sycl::queue &, + size_t, + size_t, + float *, + size_t, + float, + int *); + +template sycl::kernel +dpcpp_kernels::get_local_count_exceedance_kernel(sycl::queue &, + size_t, + size_t, + double *, + size_t, + double, + int *); diff --git a/dpctl-capi/tests/dpcpp_kernels.hpp b/dpctl-capi/tests/dpcpp_kernels.hpp new file mode 100644 index 0000000000..099ea1a2da --- /dev/null +++ b/dpctl-capi/tests/dpcpp_kernels.hpp @@ -0,0 +1,299 @@ +#pragma once +#include + +namespace dpcpp_kernels +{ + +namespace +{ +template class populate_a; + +template class populate_b; + +template class mad_kern; + +template +auto make_cgh_function(int n, kernelFunc func) +{ + auto Kernel = [&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>(n), func); + }; + return Kernel; +}; + +template struct MadFunc +{ + const Ty *in1, *in2; + Ty *out; + scT val; + MadFunc(const Ty *a, const Ty *b, Ty *c, scT d) + : in1(a), in2(b), out(c), val(d) + { + } + void operator()(sycl::id<1> myId) const + { + auto gid = myId[0]; + out[gid] = in1[gid] + val * in2[gid]; + return; + } +}; + +template struct FillFunc +{ + T *out; + T val; + FillFunc(T *a, T val) : out(a), val(val) {} + void operator()(sycl::id<1> myId) const + { + auto gid = myId[0]; + out[gid] = val; + return; + }; +}; + +template struct RangeFunc +{ + T *out; + RangeFunc(T *b) : out(b) {} + void operator()(sycl::id<1> myId) const + { + auto gid = myId[0]; + out[gid] = T(gid); + return; + }; +}; + +} // namespace + +template +sycl::kernel get_fill_kernel(sycl::queue &q, size_t n, T *out, T fill_val) +{ + // out[i] = fill_val + sycl::program program(q.get_context()); + + [[maybe_unused]] auto cgh_fn = + make_cgh_function>(n, FillFunc(out, fill_val)); + + program.build_with_kernel_type>(); + return program.get_kernel>(); +}; + +template +sycl::kernel get_range_kernel(sycl::queue &q, size_t n, T *b) +{ + // b[i] = i + sycl::program program(q.get_context()); + + [[maybe_unused]] auto cgh_fn = + make_cgh_function>(n, RangeFunc(b)); + + program.build_with_kernel_type>(); + return program.get_kernel>(); +}; + +template +sycl::kernel +get_mad_kernel(sycl::queue &q, size_t n, T *in1, T *in2, T *out, scT val) +{ + // c[i] = a[i] + b[i] * val + sycl::program program(q.get_context()); + + [[maybe_unused]] auto cgh_fn = make_cgh_function>( + n, MadFunc(in1, in2, out, val)); + + program.build_with_kernel_type>(); + return program.get_kernel>(); +}; + +template +auto make_cgh_nd_function_with_local_memory(const sycl::nd_range<1> &nd_range, + size_t slm_size, + KernelFuncArgs kern_params) +{ + auto Kernel = [&](sycl::handler &cgh) { + localAccessorT lm(slm_size, cgh); + cgh.parallel_for(nd_range, KernelFunctor(kern_params, lm)); + }; + return Kernel; +}; + +template +auto make_cgh_nd_function(const sycl::nd_range<1> &nd_range, KernelFunctor kern) +{ + auto Kernel = [&](sycl::handler &cgh) { + cgh.parallel_for(nd_range, kern); + }; + return Kernel; +}; + +template struct LocalSortArgs +{ + T *arr; + size_t global_array_size; + size_t wg_chunk_size; + LocalSortArgs(T *arr, size_t arr_len, size_t wg_len) + : arr(arr), global_array_size(arr_len), wg_chunk_size(wg_len) + { + } + ~LocalSortArgs() {} + + T *get_array_pointer() const + { + return arr; + } + size_t get_array_size() const + { + return global_array_size; + } + size_t get_chunk_size() const + { + return wg_chunk_size; + } +}; + +template struct LocalSortFunc +{ + /* + + */ + T *arr; + size_t global_array_size; + size_t wg_chunk_size; + localAccessorT lm; + LocalSortFunc(T *arr, size_t arr_len, size_t wg_len, localAccessorT lm) + : arr(arr), global_array_size(arr_len), wg_chunk_size(wg_len), lm(lm) + { + } + template + LocalSortFunc(paramsT params, localAccessorT lm) + : arr(params.get_array_pointer()), + global_array_size(params.get_array_size()), + wg_chunk_size(params.get_chunk_size()), lm(lm) + { + } + ~LocalSortFunc() {} + void operator()(sycl::nd_item<1> item) const + { + /* Use odd-even merge sort to sort lws chunk of array */ + size_t group_id = item.get_group_linear_id(); + size_t chunk_size = + sycl::min((group_id + 1) * wg_chunk_size, global_array_size) - + group_id * wg_chunk_size; + + // compute the greatest power of 2 less than chunk_size + size_t sp2 = 1; + while (sp2 < chunk_size) { + sp2 <<= 1; + } + sp2 >>= 1; + + size_t gid = item.get_global_linear_id(); + size_t lid = item.get_local_linear_id(); + + if (gid < global_array_size) { + lm[lid] = arr[gid]; + } + item.barrier(sycl::access::fence_space::local_space); + + for (size_t p = sp2; p > 0; p >>= 1) { + size_t q = sp2; + size_t r = 0; + for (size_t d = p; d > 0; d = q - p, q >>= 1, r = p) { + if ((lid < chunk_size - d) && (lid & p) == r) { + size_t i = lid; + size_t j = i + d; + T v1 = lm[i]; + T v2 = lm[j]; + if (v1 > v2) { + lm[i] = v2; + lm[j] = v1; + } + } + item.barrier(sycl::access::fence_space::local_space); + } + } + if (gid < global_array_size) { + arr[gid] = lm[lid]; + } + }; +}; + +template class local_sort_kern; + +template +sycl::kernel get_local_sort_kernel(sycl::queue &q, + size_t gws, + size_t lws, + T *arr, + size_t arr_len) +{ + sycl::program program(q.get_context()); + + using local_accessor_t = + sycl::accessor; + + [[maybe_unused]] auto cgh_fn = make_cgh_nd_function_with_local_memory< + local_sort_kern, local_accessor_t, LocalSortArgs, + LocalSortFunc>( + sycl::nd_range<1>(gws, lws), lws, LocalSortArgs(arr, arr_len, lws)); + + program.build_with_kernel_type>(); + return program.get_kernel>(); +}; + +template struct LocalCountExceedanceFunc +{ + T *arr; + size_t arr_len; + T threshold_val; + int *count_arr; + LocalCountExceedanceFunc(T *arr, + size_t arr_len, + T threshold_val, + int *count_arr) + : arr(arr), arr_len(arr_len), threshold_val(threshold_val), + count_arr(count_arr) + { + } + + void operator()(sycl::nd_item<1> item) const + { + /* count number of array elements in group chunk that + exceeds the threshold value */ + size_t gid = item.get_global_linear_id(); + int partial_sum = sycl::ONEAPI::reduce( + item.get_group(), + (gid < arr_len) ? int(arr[gid] > threshold_val) : int(0), + std::plus()); + count_arr[item.get_group_linear_id()] = partial_sum; + } +}; + +template class local_exceedance_kern; + +template +sycl::kernel get_local_count_exceedance_kernel(sycl::queue &q, + size_t gws, + size_t lws, + T *arr, + size_t arr_len, + T threshold_val, + int *counts) +{ + sycl::program program(q.get_context()); + + [[maybe_unused]] auto cgh_fn = + make_cgh_nd_function, + LocalCountExceedanceFunc>( + sycl::nd_range<1>(gws, lws), + LocalCountExceedanceFunc(arr, arr_len, threshold_val, counts)); + + program.build_with_kernel_type>(); + return program.get_kernel>(); +}; + +} // namespace dpcpp_kernels diff --git a/dpctl-capi/tests/test_sycl_queue_submit.cpp b/dpctl-capi/tests/test_sycl_queue_submit.cpp index c94089aeef..ca9fb3f233 100644 --- a/dpctl-capi/tests/test_sycl_queue_submit.cpp +++ b/dpctl-capi/tests/test_sycl_queue_submit.cpp @@ -24,6 +24,9 @@ //===----------------------------------------------------------------------===// #include "Support/CBindingWrapping.h" +#ifndef DPCTL_COVERAGE +#include "dpcpp_kernels.hpp" +#endif #include "dpctl_sycl_context_interface.h" #include "dpctl_sycl_device_interface.h" #include "dpctl_sycl_device_selector_interface.h" @@ -123,3 +126,234 @@ TEST_F(TestQueueSubmit, CheckSubmitRange_saxpy) DPCTLDevice_Delete(DRef); DPCTLDeviceSelector_Delete(DSRef); } + +#ifndef DPCTL_COVERAGE +namespace +{ + +template +bool common_submit_range_fn(sycl::queue &q, size_t n, scT val) +{ + T *a = sycl::malloc_device(n, q); + T *b = sycl::malloc_device(n, q); + T *c = sycl::malloc_device(n, q); + T fill_val = 1; + size_t Range[] = {n}; + + auto popA_kernel = dpcpp_kernels::get_fill_kernel(q, n, a, fill_val); + auto popB_kernel = dpcpp_kernels::get_range_kernel(q, n, b); + auto mad_kernel = dpcpp_kernels::get_mad_kernel(q, n, a, b, c, val); + + DPCTLSyclKernelRef popAKernRef = + reinterpret_cast(&popA_kernel); + DPCTLSyclKernelRef popBKernRef = + reinterpret_cast(&popB_kernel); + DPCTLSyclKernelRef madKernRef = + reinterpret_cast(&mad_kernel); + + DPCTLSyclQueueRef QRef = reinterpret_cast(&q); + void *popAArgs[] = {reinterpret_cast(a), + reinterpret_cast(&fill_val)}; + DPCTLKernelArgType popAKernelArgTypes[] = {DPCTL_VOID_PTR, katT}; + + DPCTLSyclEventRef popAERef = + DPCTLQueue_SubmitRange(popAKernRef, QRef, popAArgs, popAKernelArgTypes, + 2, Range, 1, nullptr, 0); + + void *popBArgs[] = {reinterpret_cast(b)}; + DPCTLKernelArgType popBKernelArgTypes[] = {DPCTL_VOID_PTR}; + + DPCTLSyclEventRef popBERef = + DPCTLQueue_SubmitRange(popBKernRef, QRef, popBArgs, popBKernelArgTypes, + 1, Range, 1, nullptr, 0); + + void *madArgs[] = {reinterpret_cast(a), reinterpret_cast(b), + reinterpret_cast(c), + reinterpret_cast(&val)}; + DPCTLKernelArgType madKernelArgTypes[] = {DPCTL_VOID_PTR, DPCTL_VOID_PTR, + DPCTL_VOID_PTR, katscT}; + + DPCTLSyclEventRef deps[2] = {popAERef, popBERef}; + DPCTLSyclEventRef madRef = DPCTLQueue_SubmitRange( + madKernRef, QRef, madArgs, madKernelArgTypes, 4, Range, 1, deps, 2); + + DPCTLQueue_Wait(QRef); + DPCTLEvent_Delete(madRef); + DPCTLEvent_Delete(popBERef); + DPCTLEvent_Delete(popAERef); + + bool worked = true; + T *host_data = new T[n]; + q.memcpy(host_data, c, n * sizeof(T)).wait(); + for (size_t i = 0; i < n; ++i) { + worked = worked && (host_data[i] == T(fill_val) + T(i) * T(val)); + } + delete[] host_data; + + sycl::free(c, q); + sycl::free(b, q); + sycl::free(a, q); + + return worked; +}; + +template +bool common_submit_ndrange_fn(sycl::queue &q, size_t n) +{ + size_t lws = 64; + size_t n_groups = (n + lws - 1) / lws; + size_t gws = n_groups * lws; + + T *a = sycl::malloc_device(n, q); + int *counts = sycl::malloc_device(n_groups, q); + size_t Range[] = {n}; + size_t gRange[] = {gws}; + size_t lRange[] = {lws}; + + auto popA_kernel = dpcpp_kernels::get_range_kernel(q, n, a); + T threshold_val = T(n / 2); + + auto count_kernel = dpcpp_kernels::get_local_count_exceedance_kernel( + q, gws, lws, a, n, threshold_val, counts); + + DPCTLSyclKernelRef countKernRef = + reinterpret_cast(&count_kernel); + DPCTLSyclKernelRef popAKernRef = + reinterpret_cast(&popA_kernel); + + DPCTLSyclQueueRef QRef = reinterpret_cast(&q); + void *popAArgs[] = {reinterpret_cast(a)}; + DPCTLKernelArgType popAKernelArgTypes[] = {DPCTL_VOID_PTR}; + + DPCTLSyclEventRef popAERef = + DPCTLQueue_SubmitRange(popAKernRef, QRef, popAArgs, popAKernelArgTypes, + 1, Range, 1, nullptr, 0); + + void *countArgs[] = {reinterpret_cast(a), + reinterpret_cast(&n), + reinterpret_cast(&threshold_val), + reinterpret_cast(counts)}; + DPCTLKernelArgType countKernelArgTypes[] = {DPCTL_VOID_PTR, DPCTL_SIZE_T, + katT, DPCTL_VOID_PTR}; + DPCTLSyclEventRef deps[1] = {popAERef}; + + DPCTLSyclEventRef aggregERef = DPCTLQueue_SubmitNDRange( + countKernRef, QRef, countArgs, countKernelArgTypes, 4, gRange, lRange, + 1, deps, 1); + + DPCTLEvent_Wait(aggregERef); + DPCTLEvent_Delete(popAERef); + DPCTLEvent_Delete(aggregERef); + + bool worked = true; + T *host_a = new T[n]; + int *host_counts = new int[n_groups]; + q.memcpy(host_a, a, n * sizeof(T)); + q.memcpy(host_counts, counts, n_groups * sizeof(int)); + q.wait_and_throw(); + + sycl::free(a, q); + sycl::free(counts, q); + + for (size_t group_id = 0, gid = 0; group_id < n_groups; ++group_id) { + int count = 0; + for (size_t lid = 0; lid < lws; ++lid, ++gid) { + if (gid < n) { + count += int(host_a[gid] > threshold_val); + } + } + worked = worked && (count == host_counts[group_id]); + } + + delete[] host_counts; + delete[] host_a; + + return worked; +} + +} // namespace + +struct TestQueueSubmitRange : public ::testing::Test +{ + sycl::queue q; + size_t n_elems = 512; + + TestQueueSubmitRange() : q(sycl::default_selector{}) {} + ~TestQueueSubmitRange() {} +}; + +TEST_F(TestQueueSubmitRange, ChkSubmitRangeInt) +{ + bool worked = false; + worked = common_submit_range_fn(q, n_elems, + int(-1)); + EXPECT_TRUE(worked); +} + +TEST_F(TestQueueSubmitRange, ChkSubmitRangeUnsignedInt) +{ + bool worked = false; + worked = + common_submit_range_fn(q, n_elems, int(2)); + EXPECT_TRUE(worked); +} + +TEST_F(TestQueueSubmitRange, ChkSubmitRangeFloat) +{ + bool worked = false; + worked = common_submit_range_fn( + q, n_elems, float(0.5)); + EXPECT_TRUE(worked); +} + +TEST_F(TestQueueSubmitRange, ChkSubmitRangeDouble) +{ + bool worked = false; + worked = common_submit_range_fn( + q, n_elems, double(-0.5)); + EXPECT_TRUE(worked); +} + +struct TestQueueSubmitNDRange : public ::testing::Test +{ + sycl::queue q; + size_t n_elems = 512; + + TestQueueSubmitNDRange() : q(sycl::default_selector{}) {} + ~TestQueueSubmitNDRange() {} +}; + +TEST_F(TestQueueSubmitNDRange, ChkSubmitNDRangeInt) +{ + bool worked = false; + worked = common_submit_ndrange_fn(q, n_elems); + EXPECT_TRUE(worked); +} + +TEST_F(TestQueueSubmitNDRange, ChkSubmitNDRangeUnsignedInt) +{ + bool worked = false; + worked = + common_submit_ndrange_fn(q, n_elems); + EXPECT_TRUE(worked); +} + +TEST_F(TestQueueSubmitNDRange, ChkSubmitNDRangeFloat) +{ + bool worked = false; + worked = common_submit_ndrange_fn(q, n_elems); + EXPECT_TRUE(worked); +} + +TEST_F(TestQueueSubmitNDRange, ChkSubmitNDRangeDouble) +{ + bool worked = false; + worked = common_submit_ndrange_fn(q, n_elems); + EXPECT_TRUE(worked); +} + +#endif