diff --git a/SYCL/AtomicRef/add-native.cpp b/SYCL/AtomicRef/add-native.cpp
deleted file mode 100644
index dcd5329e86..0000000000
--- a/SYCL/AtomicRef/add-native.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
-// RUN: -fsycl-targets=%sycl_triple %s -o %t.out \
-// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda-sycldevice --cuda-gpu-arch=sm_60
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// TODO: Remove items from UNSUPPORTED once corresponding backends support
-// "native" implementation
-// UNSUPPORTED: cpu
-
-#include <CL/sycl.hpp>
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-#include <numeric>
-#include <vector>
-using namespace sycl;
-using namespace sycl::ONEAPI;
-
-template <typename T, typename Difference = T>
-void add_fetch_test(queue q, size_t N) {
-  T sum = 0;
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> sum_buf(&sum, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(sum[0]);
-        out[gid] = atm.fetch_add(Difference(1));
-      });
-    });
-  }
-
-  // All work-items increment by 1, so final value should be equal to N
-  assert(sum == T(N));
-
-  // Fetch returns original value: will be in [0, N-1]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(0) && *max_e == T(N - 1));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void add_plus_equal_test(queue q, size_t N) {
-  T sum = 0;
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> sum_buf(&sum, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(sum[0]);
-        out[gid] = atm += Difference(1);
-      });
-    });
-  }
-
-  // All work-items increment by 1, so final value should be equal to N
-  assert(sum == T(N));
-
-  // += returns updated value: will be in [1, N]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(1) && *max_e == T(N));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void add_pre_inc_test(queue q, size_t N) {
-  T sum = 0;
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> sum_buf(&sum, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(sum[0]);
-        out[gid] = ++atm;
-      });
-    });
-  }
-
-  // All work-items increment by 1, so final value should be equal to N
-  assert(sum == T(N));
-
-  // Pre-increment returns updated value: will be in [1, N]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(1) && *max_e == T(N));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void add_post_inc_test(queue q, size_t N) {
-  T sum = 0;
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> sum_buf(&sum, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(sum[0]);
-        out[gid] = atm++;
-      });
-    });
-  }
-
-  // All work-items increment by 1, so final value should be equal to N
-  assert(sum == T(N));
-
-  // Post-increment returns original value: will be in [0, N-1]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(0) && *max_e == T(N - 1));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void add_test(queue q, size_t N) {
-  add_fetch_test<T, Difference>(q, N);
-  add_plus_equal_test<T, Difference>(q, N);
-  add_pre_inc_test<T, Difference>(q, N);
-  add_post_inc_test<T, Difference>(q, N);
-}
-
-// Floating-point types do not support pre- or post-increment
-template <> void add_test<float>(queue q, size_t N) {
-  add_fetch_test<float>(q, N);
-  add_plus_equal_test<float>(q, N);
-}
-template <> void add_test<double>(queue q, size_t N) {
-  add_fetch_test<double>(q, N);
-  add_plus_equal_test<double>(q, N);
-}
-
-int main() {
-  queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-
-  constexpr int N = 32;
-  add_test<int>(q, N);
-  add_test<unsigned int>(q, N);
-  add_test<long>(q, N);
-  add_test<unsigned long>(q, N);
-  add_test<long long>(q, N);
-  add_test<unsigned long long>(q, N);
-  add_test<float>(q, N);
-  add_test<double>(q, N);
-  add_test<char *, ptrdiff_t>(q, N);
-
-  std::cout << "Test passed." << std::endl;
-}
diff --git a/SYCL/AtomicRef/add-emulated.cpp b/SYCL/AtomicRef/add.cpp
similarity index 98%
rename from SYCL/AtomicRef/add-emulated.cpp
rename to SYCL/AtomicRef/add.cpp
index c76b7ef99d..a947a0d450 100644
--- a/SYCL/AtomicRef/add-emulated.cpp
+++ b/SYCL/AtomicRef/add.cpp
@@ -1,4 +1,5 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \
+// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda-sycldevice --cuda-gpu-arch=sm_60
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/AtomicRef/max-native.cpp b/SYCL/AtomicRef/max-native.cpp
deleted file mode 100644
index 42f3f83404..0000000000
--- a/SYCL/AtomicRef/max-native.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
-// RUN: -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// TODO: Remove items from UNSUPPORTED once corresponding backends support
-// "native" implementation
-// UNSUPPORTED: cpu, cuda
-
-#include <CL/sycl.hpp>
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-#include <numeric>
-#include <vector>
-using namespace sycl;
-using namespace sycl::ONEAPI;
-
-template <typename T> void max_test(queue q, size_t N) {
-  T initial = std::numeric_limits<T>::lowest();
-  T val = initial;
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), std::numeric_limits<T>::max());
-  {
-    buffer<T> val_buf(&val, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(val[0]);
-
-        // +1 accounts for lowest() returning 0 for unsigned types
-        out[gid] = atm.fetch_max(T(gid) + 1);
-      });
-    });
-  }
-
-  // Final value should be equal to N
-  assert(val == N);
-
-  // Only one work-item should have received the initial value
-  assert(std::count(output.begin(), output.end(), initial) == 1);
-
-  // fetch_max returns original value
-  // Intermediate values should all be >= initial value
-  for (int i = 0; i < N; ++i) {
-    assert(output[i] >= initial);
-  }
-}
-
-int main() {
-  queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-
-  constexpr int N = 32;
-  max_test<int>(q, N);
-  max_test<unsigned int>(q, N);
-  max_test<long>(q, N);
-  max_test<unsigned long>(q, N);
-  max_test<long long>(q, N);
-  max_test<unsigned long long>(q, N);
-  max_test<float>(q, N);
-  max_test<double>(q, N);
-
-  std::cout << "Test passed." << std::endl;
-}
diff --git a/SYCL/AtomicRef/max-emulated.cpp b/SYCL/AtomicRef/max.cpp
similarity index 100%
rename from SYCL/AtomicRef/max-emulated.cpp
rename to SYCL/AtomicRef/max.cpp
index 44e100387c..db7735e68d 100644
--- a/SYCL/AtomicRef/max-emulated.cpp
+++ b/SYCL/AtomicRef/max.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/SYCL/AtomicRef/min-native.cpp b/SYCL/AtomicRef/min-native.cpp
deleted file mode 100644
index eac73e7332..0000000000
--- a/SYCL/AtomicRef/min-native.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
-// RUN: -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// TODO: Remove items from UNSUPPORTED once corresponding backends support
-// "native" implementation
-// UNSUPPORTED: cpu, cuda
-
-#include <CL/sycl.hpp>
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-#include <numeric>
-#include <vector>
-using namespace sycl;
-using namespace sycl::ONEAPI;
-
-template <typename T> void min_test(queue q, size_t N) {
-  T initial = std::numeric_limits<T>::max();
-  T val = initial;
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), 0);
-  {
-    buffer<T> val_buf(&val, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(val[0]);
-        out[gid] = atm.fetch_min(T(gid));
-      });
-    });
-  }
-
-  // Final value should be equal to 0
-  assert(val == 0);
-
-  // Only one work-item should have received the initial value
-  assert(std::count(output.begin(), output.end(), initial) == 1);
-
-  // fetch_min returns original value
-  // Intermediate values should all be <= initial value
-  for (int i = 0; i < N; ++i) {
-    assert(output[i] <= initial);
-  }
-}
-
-int main() {
-  queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-
-  constexpr int N = 32;
-  min_test<int>(q, N);
-  min_test<unsigned int>(q, N);
-  min_test<long>(q, N);
-  min_test<unsigned long>(q, N);
-  min_test<long long>(q, N);
-  min_test<unsigned long long>(q, N);
-  min_test<float>(q, N);
-  min_test<double>(q, N);
-
-  std::cout << "Test passed." << std::endl;
-}
diff --git a/SYCL/AtomicRef/min-emulated.cpp b/SYCL/AtomicRef/min.cpp
similarity index 100%
rename from SYCL/AtomicRef/min-emulated.cpp
rename to SYCL/AtomicRef/min.cpp
index ca94aa5a7f..324fdcca9f 100644
--- a/SYCL/AtomicRef/min-emulated.cpp
+++ b/SYCL/AtomicRef/min.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/SYCL/AtomicRef/sub-native.cpp b/SYCL/AtomicRef/sub-native.cpp
deleted file mode 100644
index 5a8c709517..0000000000
--- a/SYCL/AtomicRef/sub-native.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
-// RUN: -fsycl-targets=%sycl_triple %s -o %t.out \
-// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda-sycldevice --cuda-gpu-arch=sm_60
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// TODO: Remove items from UNSUPPORTED once corresponding backends support
-// "native" implementation
-// UNSUPPORTED: cpu
-
-#include <CL/sycl.hpp>
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-#include <numeric>
-#include <vector>
-using namespace sycl;
-using namespace sycl::ONEAPI;
-
-template <typename T, typename Difference = T>
-void sub_fetch_test(queue q, size_t N) {
-  T val = T(N);
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> val_buf(&val, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(val[0]);
-        out[gid] = atm.fetch_sub(Difference(1));
-      });
-    });
-  }
-
-  // All work-items decrement by 1, so final value should be equal to 0
-  assert(val == T(0));
-
-  // Fetch returns original value: will be in [1, N]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(1) && *max_e == T(N));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void sub_plus_equal_test(queue q, size_t N) {
-  T val = T(N);
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> val_buf(&val, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(val[0]);
-        out[gid] = atm -= Difference(1);
-      });
-    });
-  }
-
-  // All work-items decrement by 1, so final value should be equal to 0
-  assert(val == T(0));
-
-  // -= returns updated value: will be in [0, N-1]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(0) && *max_e == T(N - 1));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void sub_pre_dec_test(queue q, size_t N) {
-  T val = T(N);
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> val_buf(&val, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(val[0]);
-        out[gid] = --atm;
-      });
-    });
-  }
-
-  // All work-items decrement by 1, so final value should be equal to 0
-  assert(val == T(0));
-
-  // Pre-decrement returns updated value: will be in [0, N-1]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(0) && *max_e == T(N - 1));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void sub_post_dec_test(queue q, size_t N) {
-  T val = T(N);
-  std::vector<T> output(N);
-  std::fill(output.begin(), output.end(), T(0));
-  {
-    buffer<T> val_buf(&val, 1);
-    buffer<T> output_buf(output.data(), output.size());
-
-    q.submit([&](handler &cgh) {
-      auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out =
-          output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
-                              access::address_space::global_space>(val[0]);
-        out[gid] = atm--;
-      });
-    });
-  }
-
-  // All work-items decrement by 1, so final value should be equal to 0
-  assert(val == T(0));
-
-  // Post-decrement returns original value: will be in [1, N]
-  auto min_e = std::min_element(output.begin(), output.end());
-  auto max_e = std::max_element(output.begin(), output.end());
-  assert(*min_e == T(1) && *max_e == T(N));
-
-  // Intermediate values should be unique
-  std::sort(output.begin(), output.end());
-  assert(std::unique(output.begin(), output.end()) == output.end());
-}
-
-template <typename T, typename Difference = T>
-void sub_test(queue q, size_t N) {
-  sub_fetch_test<T, Difference>(q, N);
-  sub_plus_equal_test<T, Difference>(q, N);
-  sub_pre_dec_test<T, Difference>(q, N);
-  sub_post_dec_test<T, Difference>(q, N);
-}
-
-// Floating-point types do not support pre- or post-decrement
-template <> void sub_test<float>(queue q, size_t N) {
-  sub_fetch_test<float>(q, N);
-  sub_plus_equal_test<float>(q, N);
-}
-template <> void sub_test<double>(queue q, size_t N) {
-  sub_fetch_test<double>(q, N);
-  sub_plus_equal_test<double>(q, N);
-}
-
-int main() {
-  queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-
-  constexpr int N = 32;
-  sub_test<int>(q, N);
-  sub_test<unsigned int>(q, N);
-  sub_test<long>(q, N);
-  sub_test<unsigned long>(q, N);
-  sub_test<long long>(q, N);
-  sub_test<unsigned long long>(q, N);
-  sub_test<float>(q, N);
-  sub_test<double>(q, N);
-  sub_test<char *, ptrdiff_t>(q, N);
-
-  std::cout << "Test passed." << std::endl;
-}
diff --git a/SYCL/AtomicRef/sub-emulated.cpp b/SYCL/AtomicRef/sub.cpp
similarity index 98%
rename from SYCL/AtomicRef/sub-emulated.cpp
rename to SYCL/AtomicRef/sub.cpp
index 04e5059f7f..013197f264 100644
--- a/SYCL/AtomicRef/sub-emulated.cpp
+++ b/SYCL/AtomicRef/sub.cpp
@@ -1,4 +1,5 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \
+// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda-sycldevice --cuda-gpu-arch=sm_60
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out