diff --git a/.gitignore b/.gitignore index 55f356e..8fe35b5 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,7 @@ # Temporaries *~ *# -*/build \ No newline at end of file +*/build +] +# vim +*.swp diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp index 0e90beb..eb728aa 100644 --- a/examples/sgemm_interop/sycl_sgemm.cpp +++ b/examples/sgemm_interop/sycl_sgemm.cpp @@ -25,8 +25,7 @@ #include #include -#include -#include +#include #include #include @@ -47,25 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) { } } -void inline checkCudaErrorMsg(CUresult status, const char *msg) { - if (status != CUDA_SUCCESS) { - std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl; - exit(EXIT_FAILURE); - } -} - -class CUDASelector : public sycl::device_selector { -public: - int operator()(const sycl::device &device) const override { - if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){ - std::cout << " CUDA device found " << std::endl; - return 1; - } else{ - return -1; - } - } -}; - int main() { using namespace sycl; @@ -88,7 +68,9 @@ int main() { // B is a matrix fill with 1 std::fill(std::begin(h_B), std::end(h_B), 1.0f); - sycl::queue q{CUDASelector()}; + sycl::queue q{[](auto &d) { + return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); + }}; cublasHandle_t handle; CHECK_ERROR(cublasCreate(&handle)); @@ -104,12 +86,16 @@ int main() { auto d_C = b_C.get_access(h); h.host_task([=](sycl::interop_handle ih) { - cuCtxSetCurrent(ih.get_native_context()); + // Set the correct cuda context & stream + cuCtxSetCurrent(ih.get_native_context()); auto cuStream = ih.get_native_queue(); cublasSetStream(handle, cuStream); - auto cuA = reinterpret_cast(ih.get_native_mem(d_A)); - auto cuB = reinterpret_cast(ih.get_native_mem(d_B)); - auto cuC = reinterpret_cast(ih.get_native_mem(d_C)); + auto cuA = reinterpret_cast( + ih.get_native_mem(d_A)); + auto cuB = reinterpret_cast( + ih.get_native_mem(d_B)); + auto cuC = reinterpret_cast( + ih.get_native_mem(d_C)); CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA, diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp index 2468bf4..235bbd1 100644 --- a/examples/sgemm_interop/sycl_sgemm_usm.cpp +++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp @@ -25,8 +25,7 @@ #include #include -#include -#include +#include #include #include @@ -47,25 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) { } } -void inline checkCudaErrorMsg(CUresult status, const char *msg) { - if (status != CUDA_SUCCESS) { - std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl; - exit(EXIT_FAILURE); - } -} - -class CUDASelector : public sycl::device_selector { -public: - int operator()(const sycl::device &device) const override { - if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){ - std::cout << " CUDA device found " << std::endl; - return 1; - } else{ - return -1; - } - } -}; - int main() { using namespace sycl; @@ -88,12 +68,14 @@ int main() { // B is a matrix fill with 1 std::fill(std::begin(h_B), std::end(h_B), 1.0f); - sycl::queue q{CUDASelector()}; + sycl::queue q{[](auto &d) { + return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); + }}; // Allocate memory on the device - float* d_A = sycl::malloc_device(WIDTH*HEIGHT,q); - float* d_B = sycl::malloc_device(WIDTH*HEIGHT,q); - float* d_C = sycl::malloc_device(WIDTH*HEIGHT,q); + float *d_A = sycl::malloc_device(WIDTH * HEIGHT, q); + float *d_B = sycl::malloc_device(WIDTH * HEIGHT, q); + float *d_C = sycl::malloc_device(WIDTH * HEIGHT, q); // Copy matrices A & B to device from host vectors const size_t numBytes = WIDTH * HEIGHT * sizeof(float); @@ -105,21 +87,19 @@ int main() { CHECK_ERROR(cublasCreate(&handle)); q.submit([&](handler &h) { - - h.host_task([=](sycl::interop_handle ih) { - - // Set the correct cuda context & stream - cuCtxSetCurrent(ih.get_native_context()); - auto cuStream = ih.get_native_queue(); - cublasSetStream(handle, cuStream); - - // Call generalised matrix-matrix multiply - CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, - WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, - d_C, WIDTH)); - cuStreamSynchronize(cuStream); - }); - }).wait(); + h.host_task([=](sycl::interop_handle ih) { + // Set the correct cuda context & stream + cuCtxSetCurrent(ih.get_native_context()); + auto cuStream = ih.get_native_queue(); + cublasSetStream(handle, cuStream); + + // Call generalised matrix-matrix multiply + CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, + WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, + d_C, WIDTH)); + cuStreamSynchronize(cuStream); + }); + }).wait(); // Copy the result back to host q.memcpy(h_C.data(), d_C, numBytes).wait(); diff --git a/examples/vector_addition/README.md b/examples/vector_addition/README.md index b82357a..133fc51 100644 --- a/examples/vector_addition/README.md +++ b/examples/vector_addition/README.md @@ -7,9 +7,6 @@ to highlight how to build an application with SYCL for CUDA using DPC++ support, for which an example CMakefile is provided. For detailed documentation on how to migrate from CUDA to SYCL, see [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers). -Note currently the CUDA backend does not support the [USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc) extension, so we use -`sycl::buffer` and `sycl::accessors` instead. - Pre-requisites --------------- diff --git a/examples/vector_addition/vector_addition.cpp b/examples/vector_addition/vector_addition.cpp index 9fdaf87..5e92d94 100644 --- a/examples/vector_addition/vector_addition.cpp +++ b/examples/vector_addition/vector_addition.cpp @@ -34,10 +34,9 @@ int main(int argc, char *argv[]) { // Initialize input data { - const auto dwrite_t = sycl::access::mode::discard_write; + sycl::host_accessor h_a{bufA, sycl::write_only}; + sycl::host_accessor h_b{bufB, sycl::write_only}; - auto h_a = bufA.get_access(); - auto h_b = bufB.get_access(); for (int i = 0; i < N; i++) { h_a[i] = sin(i) * sin(i); h_b[i] = cos(i) * cos(i); @@ -63,15 +62,14 @@ int main(int argc, char *argv[]) { auto b = bufB.get_access(h); auto c = bufC.get_access(h); - h.parallel_for(VecSize, - [=](sycl::id<1> i) { c[i] = a[i] + b[i]; }); + h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; }); }; myQueue.submit(cg); { - const auto read_t = sycl::access::mode::read; - auto h_c = bufC.get_access(); + sycl::host_accessor h_c{bufC, sycl::read_only}; + double sum = 0.0f; for (int i = 0; i < N; i++) { sum += h_c[i];