From e2bceafe7b90408c6a70190ced453fbf2e74c985 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Mon, 20 Jul 2020 17:09:06 -0400
Subject: [PATCH 01/13] Update namespaces to separate language/intel hw and
 adopt 2020 convention

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 .../include/CL/sycl/{ => ext}/intel/esimd.hpp |   8 +-
 .../intel/esimd/detail/esimd_host_util.hpp    |   0
 .../intel/esimd/detail/esimd_intrin.hpp       |  52 +-
 .../intel/esimd/detail/esimd_math_intrin.hpp  | 102 +--
 .../esimd/detail/esimd_memory_intrin.hpp      | 690 ++++++++++++++++++
 .../intel/esimd/detail/esimd_region.hpp       |   2 +
 .../intel/esimd/detail/esimd_types.hpp        |   6 +-
 .../intel/esimd/detail/esimd_util.hpp         |   2 +
 .../CL/sycl/{ => ext}/intel/esimd/esimd.hpp   |   6 +-
 .../sycl/{ => ext}/intel/esimd/esimd_enum.hpp |   2 +
 .../sycl/{ => ext}/intel/esimd/esimd_math.hpp |   0
 .../{ => ext}/intel/esimd/esimd_memory.hpp    |  12 +-
 .../sycl/{ => ext}/intel/esimd/esimd_view.hpp |   2 +
 .../{ => ext}/intel/fpga_device_selector.hpp  |   2 +
 .../sycl/{ => ext}/intel/fpga_extensions.hpp  |   6 +-
 .../CL/sycl/{ => ext}/intel/fpga_reg.hpp      |   2 +
 .../CL/sycl/{intel => ext/oneapi}/atomic.hpp  |   6 +-
 .../{intel => ext/oneapi}/atomic_enums.hpp    |  10 +-
 .../{intel => ext/oneapi}/atomic_fence.hpp    |   8 +-
 .../sycl/{intel => ext/oneapi}/atomic_ref.hpp |  14 +-
 .../sycl/{intel => ext/oneapi}/builtins.hpp   |  26 +-
 .../oneapi}/function_pointer.hpp              |   6 +-
 .../sycl/{intel => ext/oneapi}/functional.hpp |  24 +-
 .../{intel => ext/oneapi}/group_algorithm.hpp |  23 +-
 .../CL/sycl/{intel => ext/oneapi}/pipes.hpp   |   6 +-
 .../sycl/{intel => ext/oneapi}/reduction.hpp  |  60 +-
 .../oneapi}/spec_constant.hpp                 |   6 +-
 .../sycl/{intel => ext/oneapi}/sub_group.hpp  |  22 +-
 .../esimd/detail/esimd_memory_intrin.hpp      | 663 -----------------
 29 files changed, 922 insertions(+), 846 deletions(-)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd.hpp (78%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/detail/esimd_host_util.hpp (100%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/detail/esimd_intrin.hpp (83%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/detail/esimd_math_intrin.hpp (90%)
 create mode 100644 sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/detail/esimd_region.hpp (99%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/detail/esimd_types.hpp (98%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/detail/esimd_util.hpp (99%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/esimd.hpp (99%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/esimd_enum.hpp (98%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/esimd_math.hpp (100%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/esimd_memory.hpp (98%)
 rename sycl/include/CL/sycl/{ => ext}/intel/esimd/esimd_view.hpp (99%)
 rename sycl/include/CL/sycl/{ => ext}/intel/fpga_device_selector.hpp (97%)
 rename sycl/include/CL/sycl/{ => ext}/intel/fpga_extensions.hpp (73%)
 rename sycl/include/CL/sycl/{ => ext}/intel/fpga_reg.hpp (96%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/atomic.hpp (73%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/atomic_enums.hpp (94%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/atomic_fence.hpp (89%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/atomic_ref.hpp (98%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/builtins.hpp (81%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/function_pointer.hpp (97%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/functional.hpp (83%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/group_algorithm.hpp (98%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/pipes.hpp (98%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/reduction.hpp (95%)
 rename sycl/include/CL/sycl/{experimental => ext/oneapi}/spec_constant.hpp (95%)
 rename sycl/include/CL/sycl/{intel => ext/oneapi}/sub_group.hpp (97%)
 delete mode 100644 sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp

diff --git a/sycl/include/CL/sycl/intel/esimd.hpp b/sycl/include/CL/sycl/ext/intel/esimd.hpp
similarity index 78%
rename from sycl/include/CL/sycl/intel/esimd.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd.hpp
index 7f4b7886d2d2c..3a1cffdcd2a68 100644
--- a/sycl/include/CL/sycl/intel/esimd.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd.hpp
@@ -10,10 +10,10 @@
 
 #pragma once
 
-#include <CL/sycl/intel/esimd/esimd.hpp>
-#include <CL/sycl/intel/esimd/esimd_math.hpp>
-#include <CL/sycl/intel/esimd/esimd_memory.hpp>
-#include <CL/sycl/intel/esimd/esimd_view.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_math.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_memory.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_view.hpp>
 
 #ifdef __SYCL_DEVICE_ONLY__
 #define SYCL_ESIMD_KERNEL __attribute__((sycl_explicit_simd))
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_host_util.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_host_util.hpp
similarity index 100%
rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_host_util.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_host_util.hpp
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp
similarity index 83%
rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp
index 23674ac3d3e91..fdaca49bf6e30 100644
--- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp
@@ -11,9 +11,9 @@
 
 #pragma once
 
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_util.hpp>
-#include <CL/sycl/intel/esimd/esimd_enum.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_util.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
 #include <cstdint>
 
 // \brief __esimd_rdregion: region access intrinsic.
@@ -60,8 +60,9 @@
 //
 template <typename T, int N, int M, int VStride, int Width, int Stride,
           int ParentWidth = 0>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, M>
-__esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<T, M>
+__esimd_rdregion(sycl::ext::intel::gpu::vector_type_t<T, N> Input,
+                 uint16_t Offset);
 
 // __esimd_wrregion returns the updated vector with the region updated.
 //
@@ -112,10 +113,11 @@ __esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset);
 //
 template <typename T, int N, int M, int VStride, int Width, int Stride,
           int ParentWidth = 0>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
-__esimd_wrregion(sycl::intel::gpu::vector_type_t<T, N> OldVal,
-                 sycl::intel::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
-                 sycl::intel::gpu::mask_type_t<M> Mask = 1);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<T, N>
+__esimd_wrregion(sycl::ext::intel::gpu::vector_type_t<T, N> OldVal,
+                 sycl::ext::intel::gpu::vector_type_t<T, M> NewVal,
+                 uint16_t Offset,
+                 sycl::ext::intel::gpu::mask_type_t<M> Mask = 1);
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
@@ -218,37 +220,41 @@ readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
 // optimization on simd object
 //
 template <typename T, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
-__esimd_vload(const sycl::intel::gpu::vector_type_t<T, N> *ptr);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<T, N>
+__esimd_vload(const sycl::ext::intel::gpu::vector_type_t<T, N> *ptr);
 
 // vstore
 //
 // map to the backend vstore intrinsic, used by compiler to control
 // optimization on simd object
 template <typename T, int N>
-SYCL_EXTERNAL void __esimd_vstore(sycl::intel::gpu::vector_type_t<T, N> *ptr,
-                                  sycl::intel::gpu::vector_type_t<T, N> vals);
+SYCL_EXTERNAL void
+__esimd_vstore(sycl::ext::intel::gpu::vector_type_t<T, N> *ptr,
+               sycl::ext::intel::gpu::vector_type_t<T, N> vals);
 
 template <typename T, int N>
-SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t<T, N> src);
+SYCL_EXTERNAL uint16_t
+__esimd_any(sycl::ext::intel::gpu::vector_type_t<T, N> src);
 
 template <typename T, int N>
-SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t<T, N> src);
+SYCL_EXTERNAL uint16_t
+__esimd_all(sycl::ext::intel::gpu::vector_type_t<T, N> src);
 
 #ifndef __SYCL_DEVICE_ONLY__
 
 // Implementations of ESIMD intrinsics for the SYCL host device
 template <typename T, int N, int M, int VStride, int Width, int Stride,
           int ParentWidth>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, M>
-__esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<T, M>
+__esimd_rdregion(sycl::ext::intel::gpu::vector_type_t<T, N> Input,
+                 uint16_t Offset) {
   uint16_t EltOffset = Offset / sizeof(T);
   assert(Offset % sizeof(T) == 0);
 
   int NumRows = M / Width;
   assert(M % Width == 0);
 
-  sycl::intel::gpu::vector_type_t<T, M> Result;
+  sycl::ext::intel::gpu::vector_type_t<T, M> Result;
   int Index = 0;
   for (int i = 0; i < NumRows; ++i) {
     for (int j = 0; j < Width; ++j) {
@@ -260,17 +266,17 @@ __esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
 
 template <typename T, int N, int M, int VStride, int Width, int Stride,
           int ParentWidth>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
-__esimd_wrregion(sycl::intel::gpu::vector_type_t<T, N> OldVal,
-                 sycl::intel::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
-                 sycl::intel::gpu::mask_type_t<M> Mask) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<T, N>
+__esimd_wrregion(sycl::ext::intel::gpu::vector_type_t<T, N> OldVal,
+                 sycl::ext::intel::gpu::vector_type_t<T, M> NewVal,
+                 uint16_t Offset, sycl::ext::intel::gpu::mask_type_t<M> Mask) {
   uint16_t EltOffset = Offset / sizeof(T);
   assert(Offset % sizeof(T) == 0);
 
   int NumRows = M / Width;
   assert(M % Width == 0);
 
-  sycl::intel::gpu::vector_type_t<T, N> Result = OldVal;
+  sycl::ext::intel::gpu::vector_type_t<T, N> Result = OldVal;
   int Index = 0;
   for (int i = 0; i < NumRows; ++i) {
     for (int j = 0; j < Width; ++j) {
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp
similarity index 90%
rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp
index c3f5a9d141305..ad091c82fc694 100644
--- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp
@@ -11,12 +11,12 @@
 
 #pragma once
 
-#include <CL/sycl/intel/esimd/detail/esimd_host_util.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
-#include <CL/sycl/intel/esimd/esimd_enum.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_host_util.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
 #include <cstdint>
 
-using sycl::intel::gpu::vector_type_t;
+using sycl::ext::intel::gpu::vector_type_t;
 
 // saturation intrinsics
 template <typename T0, typename T1, int SZ>
@@ -210,39 +210,39 @@ SYCL_EXTERNAL vector_type_t<T1, N> __esimd_dp4a(vector_type_t<T2, N> src0,
 
 // Reduction functions
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_fmax(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_fmax(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2);
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_umax(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_umax(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2);
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_smax(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_smax(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2);
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_fmin(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_fmin(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2);
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_umin(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_umin(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2);
 
 template <typename Ty, int N>
-sycl::intel::gpu::vector_type_t<Ty, N> SYCL_EXTERNAL
-__esimd_reduced_smin(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2);
+sycl::ext::intel::gpu::vector_type_t<Ty, N> SYCL_EXTERNAL
+__esimd_reduced_smin(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2);
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_dp4(sycl::intel::gpu::vector_type_t<Ty, N> v1,
-            sycl::intel::gpu::vector_type_t<Ty, N> v2);
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_dp4(sycl::ext::intel::gpu::vector_type_t<Ty, N> v1,
+            sycl::ext::intel::gpu::vector_type_t<Ty, N> v2);
 
 #ifndef __SYCL_DEVICE_ONLY__
 
@@ -1096,10 +1096,10 @@ SYCL_EXTERNAL vector_type_t<T1, N> __esimd_dp4a(vector_type_t<T2, N> src0,
 };
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_max(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src2) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_max(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
+  sycl::ext::intel::gpu::vector_type_t<Ty, N> retv;
   for (int I = 0; I < N; I++) {
     if (src1[I] >= src2[I]) {
       retv[I] = src1[I];
@@ -1111,31 +1111,31 @@ __esimd_reduced_max(sycl::intel::gpu::vector_type_t<Ty, N> src1,
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_fmax(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_fmax(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
   return __esimd_reduced_max<Ty, N>(src1, src2);
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_umax(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_umax(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
   return __esimd_reduced_max<Ty, N>(src1, src2);
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_smax(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_smax(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
   return __esimd_reduced_max<Ty, N>(src1, src2);
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_min(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src2) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_min(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
+  sycl::ext::intel::gpu::vector_type_t<Ty, N> retv;
   for (int I = 0; I < N; I++) {
     if (src1[I] <= src2[I]) {
       retv[I] = src1[I];
@@ -1147,23 +1147,23 @@ __esimd_reduced_min(sycl::intel::gpu::vector_type_t<Ty, N> src1,
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_fmin(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_fmin(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
   return __esimd_reduced_min<Ty, N>(src1, src2);
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_umin(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_umin(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
   return __esimd_reduced_min<Ty, N>(src1, src2);
 }
 
 template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_reduced_smin(sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src2) {
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_reduced_smin(sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src2) {
   return __esimd_reduced_min<Ty, N>(src1, src2);
 }
 
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp
new file mode 100644
index 0000000000000..e28dad78b048e
--- /dev/null
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp
@@ -0,0 +1,690 @@
+//==------------ esimd_memory_intrin.hpp - DPC++ Explicit SIMD API ---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Declares the Explicit SIMD memory intrinsics: flat-address, surface, SLM and
+// media-block accesses, atomics, and the barrier/fence operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_util.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
+#include <cstdint>
+
+// flat_read does flat-address gather
+template <typename Ty, int N, int NumBlk = 0,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<
+    Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
+__esimd_flat_read(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                  int ElemsPerAddr = NumBlk,
+                  sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// flat_write does flat-address scatter
+template <typename Ty, int N, int NumBlk = 0,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL void __esimd_flat_write(
+    sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+    sycl::ext::intel::gpu::vector_type_t<
+        Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
+        vals,
+    int ElemsPerAddr = NumBlk,
+    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// flat_block_read reads a block of data from one flat address
+template <typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_block_read_unaligned(uint64_t addr);
+
+// flat_block_write writes a block of data using one flat address
+template <typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL void
+__esimd_flat_block_write(uint64_t addr,
+                         sycl::ext::intel::gpu::vector_type_t<Ty, N> vals);
+
+// Reads a block of data from given surface at given offset.
+template <typename Ty, int N, typename SurfIndAliasTy>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset);
+
+// Writes given block of data to a surface with given index at given offset.
+template <typename Ty, int N, typename SurfIndAliasTy>
+SYCL_EXTERNAL void
+__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> vals);
+
+// flat_read4 does flat-address gather4
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> SYCL_EXTERNAL
+__esimd_flat_read4(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                   sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// flat_write4 does flat-address scatter4
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL void __esimd_flat_write4(
+    sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+    sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
+    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// flat_atomic: flat-address atomic
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_atomic0(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                     sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred);
+
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_atomic1(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                     sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred);
+
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H =
+              sycl::ext::intel::gpu::CacheHint::None,
+          sycl::ext::intel::gpu::CacheHint L3H =
+              sycl::ext::intel::gpu::CacheHint::None>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_atomic2(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred);
+
+// esimd_barrier, generic group barrier
+SYCL_EXTERNAL void __esimd_barrier();
+
+// slm_fence sets the SLM read/write order
+SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl);
+
+// slm_read does SLM gather
+template <typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_read(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                 sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// slm_write does SLM scatter
+template <typename Ty, int N>
+SYCL_EXTERNAL void
+__esimd_slm_write(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                  sycl::ext::intel::gpu::vector_type_t<Ty, N> vals,
+                  sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// slm_block_read reads a block of data from SLM
+template <typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_block_read(uint32_t addr);
+
+// slm_block_write writes a block of data to SLM
+template <typename Ty, int N>
+SYCL_EXTERNAL void
+__esimd_slm_block_write(uint32_t addr,
+                        sycl::ext::intel::gpu::vector_type_t<Ty, N> vals);
+
+// slm_read4 does SLM gather4
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>
+__esimd_slm_read4(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                  sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// slm_write4 does SLM scatter4
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask>
+SYCL_EXTERNAL void __esimd_slm_write4(
+    sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+    sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
+    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
+
+// slm_atomic: SLM atomic
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_atomic0(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred);
+
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_atomic1(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred);
+
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_atomic2(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred);
+
+// Media block load
+//
+// @tparam Ty the element data type.
+//
+// @tparam M the height of the 2D block.
+//
+// @tparam N the width of the 2D block.
+//
+// @tparam TACC type of the surface handle.
+//
+// @param modifier top/bottom field surface access control.
+//
+// @param handle the surface handle.
+//
+// @param plane planar surface index.
+//
+// @param width the width of the return block.
+//
+// @param x X-coordinate of the left upper rectangle corner in BYTES.
+//
+// @param y Y-coordinate of the left upper rectangle corner in ROWS.
+//
+// @return the linearized 2D block data read from surface.
+//
+template <typename Ty, int M, int N, typename TACC>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, M * N>
+__esimd_media_block_load(unsigned modifier, TACC handle, unsigned plane,
+                         unsigned width, unsigned x, unsigned y);
+
+// Media block store
+//
+// @tparam Ty the element data type.
+//
+// @tparam M the height of the 2D block.
+//
+// @tparam N the width of the 2D block.
+//
+// @tparam TACC type of the surface handle.
+//
+// @param modifier top/bottom field surface access control.
+//
+// @param handle the surface handle.
+//
+// @param plane planar surface index.
+//
+// @param width the width of the return block.
+//
+// @param x X-coordinate of the left upper rectangle corner in BYTES.
+//
+// @param y Y-coordinate of the left upper rectangle corner in ROWS.
+//
+// @param vals the linearized 2D block data to be written to surface.
+//
+template <typename Ty, int M, int N, typename TACC>
+SYCL_EXTERNAL void
+__esimd_media_block_store(unsigned modifier, TACC handle, unsigned plane,
+                          unsigned width, unsigned x, unsigned y,
+                          sycl::ext::intel::gpu::vector_type_t<Ty, M * N> vals);
+
+#ifndef __SYCL_DEVICE_ONLY__
+
+template <typename Ty, int N, int NumBlk, sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<
+    Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
+__esimd_flat_read(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                  int ElemsPerAddr,
+                  sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  // Non-device implementation of the flat-address gather: every lane whose
+  // pred is set loads up to ElemsPerAddr (capped by NumBlk) elements of Ty.
+  auto NumBlkDecoded = sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk);
+  // Zero-fill so that lanes with a cleared predicate hold a defined value.
+  sycl::ext::intel::gpu::vector_type_t<
+      Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
+      V = 0;
+  ElemsPerAddr = sycl::ext::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr);
+  // Halve once up front: halving per active lane compounded across lanes.
+  if (sizeof(Ty) == 2)
+    ElemsPerAddr = ElemsPerAddr / 2;
+  for (int I = 0; I < N; I++) {
+    if (!pred[I])
+      continue;
+    Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
+    for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) {
+      // Elements of 1-2 bytes are stored lane-major, wider ones block-major.
+      V[sizeof(Ty) <= 2 ? I * NumBlkDecoded + J : J * N + I] = *(Addr + J);
+    }
+  }
+  return V;
+}
+
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask,
+          sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>
+__esimd_flat_read4(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                   sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  // Non-device implementation of the flat-address gather4 declared above.
+  sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> V = 0;
+  unsigned int Next = 0;
+  // Each enabled channel takes the next N slots of V, one slot per lane.
+  if constexpr (HasR(Mask)) {
+    for (int I = 0; I < N; I++, Next++) {
+      if (pred[I]) {
+        Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
+        V[Next] = *Addr;
+      }
+    }
+  }
+  // HasG: load from addrs[I] + sizeof(Ty).
+  if constexpr (HasG(Mask)) {
+    for (int I = 0; I < N; I++, Next++) {
+      if (pred[I]) {
+        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty));
+        V[Next] = *Addr;
+      }
+    }
+  }
+  // HasB: load from addrs[I] + 2 * sizeof(Ty).
+  if constexpr (HasB(Mask)) {
+    for (int I = 0; I < N; I++, Next++) {
+      if (pred[I]) {
+        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + 2 * sizeof(Ty));
+        V[Next] = *Addr;
+      }
+    }
+  }
+  // HasA: load from addrs[I] + 3 * sizeof(Ty).
+  if constexpr (HasA(Mask)) {
+    for (int I = 0; I < N; I++, Next++) {
+      if (pred[I]) {
+        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + 3 * sizeof(Ty));
+        V[Next] = *Addr;
+      }
+    }
+  }
+  // Slots of lanes with a cleared predicate keep their zero initializer.
+  return V;
+}
+
+// Host emulation of the flat-address per-lane write: every lane with a set
+// predicate stores up to min(decoded NumBlk, decoded ElemsPerAddr) elements
+// of vals to addrs[I]; the other lanes store nothing.
+template <typename Ty, int N, int NumBlk, sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL void __esimd_flat_write(
+    sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+    sycl::ext::intel::gpu::vector_type_t<
+        Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
+        vals,
+    int ElemsPerAddr, sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  auto NumBlkDecoded = sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk);
+  ElemsPerAddr = sycl::ext::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr);
+  // Halve once, up front, for 2-byte types; it must not be repeated per lane
+  // or the count keeps shrinking as more active lanes are processed.
+  if (sizeof(Ty) == 2)
+    ElemsPerAddr = ElemsPerAddr / 2;
+  for (int I = 0; I < N; I++) {
+    if (!pred[I])
+      continue;
+    Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
+    // 1- and 2-byte types are laid out lane-major, wider types element-major.
+    for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
+      *(Addr + J) = vals[sizeof(Ty) <= 2 ? I * NumBlkDecoded + J : J * N + I];
+  }
+}
+
+// Host emulation of the flat-address 4-channel write (the store counterpart
+// of __esimd_flat_read4). The enabled channels of Mask are processed in
+// R, G, B, A order and each one consumes the next N consecutive elements of
+// vals. Channel k of lane I is stored to addrs[I] + k * sizeof(Ty). For a
+// lane with a clear predicate nothing is stored, but its element of vals is
+// still skipped over.
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask,
+          sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL void __esimd_flat_write4(
+    sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+    sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
+    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  unsigned int Slot = 0;
+
+  if constexpr (HasR(Mask)) {
+    for (int Lane = 0; Lane < N; ++Lane, ++Slot) {
+      if (!pred[Lane])
+        continue;
+      *reinterpret_cast<Ty *>(addrs[Lane]) = vals[Slot];
+    }
+  }
+
+  if constexpr (HasG(Mask)) {
+    for (int Lane = 0; Lane < N; ++Lane, ++Slot) {
+      if (!pred[Lane])
+        continue;
+      *reinterpret_cast<Ty *>(addrs[Lane] + sizeof(Ty)) = vals[Slot];
+    }
+  }
+
+  if constexpr (HasB(Mask)) {
+    for (int Lane = 0; Lane < N; ++Lane, ++Slot) {
+      if (!pred[Lane])
+        continue;
+      *reinterpret_cast<Ty *>(addrs[Lane] + 2 * sizeof(Ty)) = vals[Slot];
+    }
+  }
+
+  if constexpr (HasA(Mask)) {
+    for (int Lane = 0; Lane < N; ++Lane, ++Slot) {
+      if (!pred[Lane])
+        continue;
+      *reinterpret_cast<Ty *>(addrs[Lane] + 3 * sizeof(Ty)) = vals[Slot];
+    }
+  }
+}
+
+// Host emulation: loads N consecutive Ty elements starting at address addr.
+template <typename Ty, int N, sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_block_read_unaligned(uint64_t addr) {
+  // Treat the raw address as the base of an array of Ty.
+  Ty *Src = reinterpret_cast<Ty *>(addr);
+  sycl::ext::intel::gpu::vector_type_t<Ty, N> Result;
+  for (int Idx = 0; Idx < N; ++Idx)
+    Result[Idx] = Src[Idx];
+  return Result;
+}
+
+// Host emulation: stores the N elements of vals contiguously at addr.
+template <typename Ty, int N, sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL void
+__esimd_flat_block_write(uint64_t addr,
+                         sycl::ext::intel::gpu::vector_type_t<Ty, N> vals) {
+  Ty *Dst = reinterpret_cast<Ty *>(addr);
+  for (int Idx = 0; Idx < N; ++Idx)
+    Dst[Idx] = vals[Idx];
+}
+
+template <typename Ty, int M, int N, typename TACC>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, M * N>
+__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane,
+                         unsigned width, unsigned x, unsigned y) {
+  // Host emulation of a media block load: returns an M x N block of Ty values.
+  // On host the input surface is modeled as a sycl image 2d object, and the
+  // read/write access is done through the accessor passed in as handle.
+  auto range =
+      sycl::ext::intel::gpu::AccessorPrivateProxy::getImageRange(handle);
+  unsigned bpp =
+      sycl::ext::intel::gpu::AccessorPrivateProxy::getElemSize(handle);
+  unsigned vpp = bpp / sizeof(Ty);
+  unsigned int i = x / bpp;
+  unsigned int j = y;
+  // x is a byte offset; it must be a multiple of the element size bpp.
+  assert(x % bpp == 0);
+  unsigned int xbound = range[0] - 1;
+  unsigned int ybound = range[1] - 1;
+  // Coordinates past the last column/row are clamped to xbound/ybound below.
+  sycl::ext::intel::gpu::vector_type_t<Ty, M * N> vals;
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col += vpp) {
+      unsigned int xoff = (i > xbound) ? xbound : i;
+      unsigned int yoff = (j > ybound) ? ybound : j;
+      auto coords = cl::sycl::cl_int2(xoff, yoff);
+      cl::sycl::cl_uint4 data = handle.read(coords);
+      // Copy the four components into a vector that can be re-viewed as Ty.
+      sycl::ext::intel::gpu::vector_type_t<unsigned int, 4> res;
+      for (int idx = 0; idx < 4; idx++) {
+        res[idx] = data[idx];
+      }
+      // View the bytes just read as refN elements of type Ty.
+      constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty);
+      unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp;
+      using refTy = sycl::ext::intel::gpu::vector_type_t<Ty, refN>;
+      auto ref = reinterpret_cast<refTy>(res);
+      // Fill vals (row-major) with vpp values taken `stride` apart in ref.
+      unsigned int offset1 = col + row * N;
+      unsigned int offset2 = 0;
+      for (int idx = 0; idx < vpp; idx++) {
+        vals[offset1] = ref[offset2];
+        offset1++;
+        offset2 += stride;
+      }
+      i++;
+    }
+    i = x / bpp;
+    j++;
+  }
+
+  return vals;
+}
+
+template <typename Ty, int M, int N, typename TACC>
+SYCL_EXTERNAL void __esimd_media_block_store(
+    unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x,
+    unsigned y, sycl::ext::intel::gpu::vector_type_t<Ty, M * N> vals) {
+  unsigned bpp =
+      sycl::ext::intel::gpu::AccessorPrivateProxy::getElemSize(handle);
+  unsigned vpp = bpp / sizeof(Ty);
+  auto range =
+      sycl::ext::intel::gpu::AccessorPrivateProxy::getImageRange(handle);
+  unsigned int i = x / bpp;
+  unsigned int j = y;
+  // Host emulation: stores the M x N block vals into the image behind handle.
+  assert(x % bpp == 0);
+  // Each inner iteration packs vpp values into one cl_uint4 and writes it.
+  for (int row = 0; row < M; row++) {
+    for (int col = 0; col < N; col += vpp) {
+      constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty);
+      sycl::ext::intel::gpu::vector_type_t<Ty, Sz> res = 0;
+      // Place the next vpp values of this row `stride` elements apart in res.
+      unsigned int offset1 = col + row * N;
+      unsigned int offset2 = 0;
+      unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp;
+      for (int idx = 0; idx < vpp; idx++) {
+        res[offset2] = vals[offset1];
+        offset1++;
+        offset2 += stride;
+      }
+      // View the packed bytes as four unsigned ints for the image write.
+      using refTy = sycl::ext::intel::gpu::vector_type_t<unsigned int, 4>;
+      auto ref = reinterpret_cast<refTy>(res);
+
+      cl::sycl::cl_uint4 data;
+      for (int idx = 0; idx < 4; idx++) {
+        data[idx] = ref[idx];
+      }
+      // Coordinates outside the image range are skipped, not clamped.
+      if (i < range[0] && j < range[1]) {
+        auto coords = cl::sycl::cl_int2(i, j);
+        handle.write(coords, data);
+      }
+      i++;
+    }
+    i = x / bpp;
+    j++;
+  }
+}
+
+// Returns 1 when at least one element of src is non-zero, otherwise 0.
+template <typename Ty, int N>
+SYCL_EXTERNAL uint16_t
+__esimd_any(sycl::ext::intel::gpu::vector_type_t<Ty, N> src) {
+  uint16_t Found = 0;
+  for (unsigned int Lane = 0; Lane != N && !Found; ++Lane)
+    Found = (src[Lane] != 0) ? 1 : 0;
+  return Found;
+}
+
+// Returns 1 when every element of src is non-zero, otherwise 0.
+template <typename Ty, int N>
+SYCL_EXTERNAL uint16_t
+__esimd_all(sycl::ext::intel::gpu::vector_type_t<Ty, N> src) {
+  uint16_t AllSet = 1;
+  for (unsigned int Lane = 0; Lane != N && AllSet; ++Lane)
+    AllSet = (src[Lane] == 0) ? 0 : 1;
+  return AllSet;
+}
+
+// Host emulation of dp4: each complete group of four lanes receives the dot
+// product of that group in v1 and v2; a partial tail group is not written.
+template <typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_dp4(sycl::ext::intel::gpu::vector_type_t<Ty, N> v1,
+            sycl::ext::intel::gpu::vector_type_t<Ty, N> v2) {
+  sycl::ext::intel::gpu::vector_type_t<Ty, N> retv;
+  for (int i = 0; i + 4 <= N; i += 4) {
+    Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) +
+            (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]);
+    for (int k = 0; k < 4; ++k)
+      retv[i + k] = dp;
+  }
+  return retv;
+}
+
+/// TODO: host stubs, currently no-ops. Marked inline because they are
+/// non-template functions defined in a header.
+SYCL_EXTERNAL inline void __esimd_barrier() {}
+SYCL_EXTERNAL inline void __esimd_slm_fence(uint8_t cntl) {}
+
+// Host stub: nothing is read; a value-initialized vector is returned.
+template <typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_read(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                 sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// slm_write does SLM scatter (host stub: empty body, nothing is written)
+template <typename Ty, int N>
+SYCL_EXTERNAL void
+__esimd_slm_write(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                  sycl::ext::intel::gpu::vector_type_t<Ty, N> vals,
+                  sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {}
+
+// slm_block_read reads a block of data from SLM.
+// Host stub: nothing is read; a value-initialized vector is returned.
+template <typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_block_read(uint32_t addr) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// slm_block_write writes a block of data to SLM (host stub: empty body)
+template <typename Ty, int N>
+SYCL_EXTERNAL void
+__esimd_slm_block_write(uint32_t addr,
+                        sycl::ext::intel::gpu::vector_type_t<Ty, N> vals) {}
+
+// slm_read4 does SLM gather4.
+// Host stub: nothing is read; a value-initialized vector is returned.
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>
+__esimd_slm_read4(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                  sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>();
+}
+
+// slm_write4 does SLM scatter4 (host stub: empty body, nothing is written)
+template <typename Ty, int N, sycl::ext::intel::gpu::ChannelMaskType Mask>
+SYCL_EXTERNAL void __esimd_slm_write4(
+    sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+    sycl::ext::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
+    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {}
+
+// slm_atomic: SLM atomic
+// Host stub: no atomic is performed; a value-initialized vector is returned.
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_atomic0(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// Host stub: no atomic is performed; a value-initialized vector is returned.
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_atomic1(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// Host stub: no atomic is performed; a value-initialized vector is returned.
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_slm_atomic2(sycl::ext::intel::gpu::vector_type_t<uint32_t, N> addrs,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                    sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// Host stub: no atomic is performed; a value-initialized vector is returned.
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_atomic0(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                     sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// Host stub: no atomic is performed; a value-initialized vector is returned.
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_atomic1(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                     sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+// Host stub: no atomic is performed; a value-initialized vector is returned.
+template <sycl::ext::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
+          sycl::ext::intel::gpu::CacheHint L1H,
+          sycl::ext::intel::gpu::CacheHint L3H>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_flat_atomic2(sycl::ext::intel::gpu::vector_type_t<uint64_t, N> addrs,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src0,
+                     sycl::ext::intel::gpu::vector_type_t<Ty, N> src1,
+                     sycl::ext::intel::gpu::vector_type_t<uint16_t, N> pred) {
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>();
+}
+
+template <typename Ty, int N, typename SurfIndAliasTy>
+SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t<Ty, N>
+__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) {
+  throw cl::sycl::feature_not_supported(); // host stub: always throws
+  return sycl::ext::intel::gpu::vector_type_t<Ty, N>(); // never reached
+}
+
+// Host stub: always throws cl::sycl::feature_not_supported.
+template <typename Ty, int N, typename SurfIndAliasTy>
+SYCL_EXTERNAL void
+__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset,
+                    sycl::ext::intel::gpu::vector_type_t<Ty, N> vals) {
+  throw cl::sycl::feature_not_supported();
+}
+
+#endif // __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_region.hpp
similarity index 99%
rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_region.hpp
index c1576415a882b..39910609e7942 100644
--- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_region.hpp
@@ -17,6 +17,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -115,5 +116,6 @@ template <typename T, typename U> T getBaseRegion(std::pair<T, U> Reg) {
 
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_types.hpp
similarity index 98%
rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_types.hpp
index 7ff12e9113dda..eb6b1fd87914c 100644
--- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_types.hpp
@@ -12,14 +12,15 @@
 
 #include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/stl_type_traits.hpp> // to define C++14,17 extensions
+#include <CL/sycl/ext/intel/esimd/detail/esimd_region.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
 #include <CL/sycl/half_type.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_region.hpp>
-#include <CL/sycl/intel/esimd/esimd_enum.hpp>
 #include <cstdint>
 #include <type_traits>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -257,5 +258,6 @@ inline std::istream &operator>>(std::istream &I, half &rhs) {
 
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp
similarity index 99%
rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp
index 54d769db75b4b..e0b7323a8bafe 100755
--- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp
@@ -41,6 +41,7 @@ static ESIMD_INLINE constexpr unsigned log2(unsigned n) {
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -211,5 +212,6 @@ template <> struct word_type<uint> { using type = ushort; };
 } // namespace details
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/esimd/esimd.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp
similarity index 99%
rename from sycl/include/CL/sycl/intel/esimd/esimd.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp
index 757055dfa00fe..c24cac7ad0b67 100644
--- a/sycl/include/CL/sycl/intel/esimd/esimd.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp
@@ -10,11 +10,12 @@
 
 #pragma once
 
-#include <CL/sycl/intel/esimd/detail/esimd_intrin.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -445,6 +446,7 @@ ESIMD_INLINE simd<U, n> convert(simd<T, n> val) {
 
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 
diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_enum.hpp
similarity index 98%
rename from sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_enum.hpp
index 4b901ea079119..626d6002af35a 100644
--- a/sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_enum.hpp
@@ -15,6 +15,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -107,5 +108,6 @@ enum class CacheHint : uint8_t {
 } // namespace gpu
 
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_math.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp
similarity index 100%
rename from sycl/include/CL/sycl/intel/esimd/esimd_math.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp
diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp
similarity index 98%
rename from sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp
index 84d175e981595..0dda839f89a16 100644
--- a/sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp
@@ -11,15 +11,16 @@
 #pragma once
 
 #include <CL/sycl/half_type.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_util.hpp>
-#include <CL/sycl/intel/esimd/esimd.hpp>
-#include <CL/sycl/intel/esimd/esimd_enum.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_util.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
 #include <cstdint>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -645,5 +646,6 @@ SYCL_EXTERNAL void slm_init(uint32_t size) {}
 #endif
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_view.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp
similarity index 99%
rename from sycl/include/CL/sycl/intel/esimd/esimd_view.hpp
rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp
index 57338a0c51e86..abded4def0c25 100644
--- a/sycl/include/CL/sycl/intel/esimd/esimd_view.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp
@@ -14,6 +14,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -381,5 +382,6 @@ template <typename BaseTy, typename RegionTy> class simd_view {
 
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/fpga_device_selector.hpp b/sycl/include/CL/sycl/ext/intel/fpga_device_selector.hpp
similarity index 97%
rename from sycl/include/CL/sycl/intel/fpga_device_selector.hpp
rename to sycl/include/CL/sycl/ext/intel/fpga_device_selector.hpp
index d5f9cab31180c..83d9e7683bdf2 100644
--- a/sycl/include/CL/sycl/intel/fpga_device_selector.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_device_selector.hpp
@@ -12,6 +12,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 
 class platform_selector : public device_selector {
@@ -48,5 +49,6 @@ class fpga_emulator_selector : public platform_selector {
 };
 
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/fpga_extensions.hpp b/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp
similarity index 73%
rename from sycl/include/CL/sycl/intel/fpga_extensions.hpp
rename to sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp
index a9fca1e6139d2..08975ae5051af 100644
--- a/sycl/include/CL/sycl/intel/fpga_extensions.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp
@@ -7,6 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl/intel/fpga_device_selector.hpp>
-#include <CL/sycl/intel/fpga_reg.hpp>
-#include <CL/sycl/intel/pipes.hpp>
+#include <CL/sycl/ext/intel/fpga_device_selector.hpp>
+#include <CL/sycl/ext/intel/fpga_reg.hpp>
+#include <CL/sycl/ext/oneapi/pipes.hpp>
diff --git a/sycl/include/CL/sycl/intel/fpga_reg.hpp b/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp
similarity index 96%
rename from sycl/include/CL/sycl/intel/fpga_reg.hpp
rename to sycl/include/CL/sycl/ext/intel/fpga_reg.hpp
index 0078dd66c383c..f183d420ca2d0 100644
--- a/sycl/include/CL/sycl/intel/fpga_reg.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp
@@ -12,6 +12,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 
 template <typename T> T fpga_reg(const T &t) {
@@ -23,6 +24,7 @@ template <typename T> T fpga_reg(const T &t) {
 }
 
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 
diff --git a/sycl/include/CL/sycl/intel/atomic.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic.hpp
similarity index 73%
rename from sycl/include/CL/sycl/intel/atomic.hpp
rename to sycl/include/CL/sycl/ext/oneapi/atomic.hpp
index bbc49ecc210d9..7712c4071a8e5 100644
--- a/sycl/include/CL/sycl/intel/atomic.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic.hpp
@@ -8,6 +8,6 @@
 
 #pragma once
 
-#include <CL/sycl/intel/atomic_enums.hpp>
-#include <CL/sycl/intel/atomic_fence.hpp>
-#include <CL/sycl/intel/atomic_ref.hpp>
+#include <CL/sycl/ext/oneapi/atomic_enums.hpp>
+#include <CL/sycl/ext/oneapi/atomic_fence.hpp>
+#include <CL/sycl/ext/oneapi/atomic_ref.hpp>
diff --git a/sycl/include/CL/sycl/intel/atomic_enums.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic_enums.hpp
similarity index 94%
rename from sycl/include/CL/sycl/intel/atomic_enums.hpp
rename to sycl/include/CL/sycl/ext/oneapi/atomic_enums.hpp
index a85c9902cd524..3c48f4d5b52bd 100644
--- a/sycl/include/CL/sycl/intel/atomic_enums.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic_enums.hpp
@@ -20,7 +20,8 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 enum class memory_order : int {
   relaxed,
@@ -63,7 +64,7 @@ namespace detail {
 // Nested ternary conditions in else branch required for C++11
 #if __cplusplus >= 201402L
 static inline constexpr std::memory_order
-getStdMemoryOrder(::cl::sycl::intel::memory_order order) {
+getStdMemoryOrder(::cl::sycl::ext::oneapi::memory_order order) {
   switch (order) {
   case memory_order::relaxed:
     return std::memory_order_relaxed;
@@ -81,7 +82,7 @@ getStdMemoryOrder(::cl::sycl::intel::memory_order order) {
 }
 #else
 static inline constexpr std::memory_order
-getStdMemoryOrder(::cl::sycl::intel::memory_order order) {
+getStdMemoryOrder(::cl::sycl::ext::oneapi::memory_order order) {
   return (order == memory_order::relaxed)
              ? std::memory_order_relaxed
              : (order == memory_order::__consume_unsupported)
@@ -98,6 +99,7 @@ getStdMemoryOrder(::cl::sycl::intel::memory_order order) {
 } // namespace detail
 #endif // __SYCL_DEVICE_ONLY__
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/atomic_fence.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic_fence.hpp
similarity index 89%
rename from sycl/include/CL/sycl/intel/atomic_fence.hpp
rename to sycl/include/CL/sycl/ext/oneapi/atomic_fence.hpp
index aba95c060b878..a5089efe44db2 100644
--- a/sycl/include/CL/sycl/intel/atomic_fence.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic_fence.hpp
@@ -10,7 +10,7 @@
 
 #include <CL/__spirv/spirv_ops.hpp>
 #include <CL/sycl/detail/spirv.hpp>
-#include <CL/sycl/intel/atomic_enums.hpp>
+#include <CL/sycl/ext/oneapi/atomic_enums.hpp>
 
 #ifndef __SYCL_DEVICE_ONLY__
 #include <atomic>
@@ -18,7 +18,8 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
+namespace ext {
+namespace oneapi {
 namespace detail {
 using namespace cl::sycl::detail;
 }
@@ -35,6 +36,7 @@ static inline void atomic_fence(memory_order order, memory_scope scope) {
 #endif
 }
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/atomic_ref.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
similarity index 98%
rename from sycl/include/CL/sycl/intel/atomic_ref.hpp
rename to sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
index f6e8d4ff68616..833ae4fe8e924 100644
--- a/sycl/include/CL/sycl/intel/atomic_ref.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
@@ -13,7 +13,7 @@
 #include <CL/sycl/atomic.hpp>
 #include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/helpers.hpp>
-#include <CL/sycl/intel/atomic_enums.hpp>
+#include <CL/sycl/ext/oneapi/atomic_enums.hpp>
 
 #ifndef __SYCL_DEVICE_ONLY__
 #include <atomic>
@@ -27,14 +27,15 @@ namespace sycl {
 template <typename pointerT, access::address_space AddressSpace>
 class multi_ptr;
 
-namespace intel {
+namespace ext {
+namespace oneapi {
 namespace detail {
 
-// Import from detail:: into intel::detail:: to improve readability later
+// Import from detail:: into oneapi::detail:: to improve readability later
 using namespace ::cl::sycl::detail;
 
-using memory_order = cl::sycl::intel::memory_order;
-using memory_scope = cl::sycl::intel::memory_scope;
+using memory_order = cl::sycl::ext::oneapi::memory_order;
+using memory_scope = cl::sycl::ext::oneapi::memory_scope;
 
 template <typename T>
 using IsValidAtomicType =
@@ -527,6 +528,7 @@ class atomic_ref : public detail::atomic_ref_impl<T, DefaultOrder, DefaultScope,
                                 AddressSpace>::atomic_ref_impl;
 };
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/builtins.hpp b/sycl/include/CL/sycl/ext/oneapi/builtins.hpp
similarity index 81%
rename from sycl/include/CL/sycl/intel/builtins.hpp
rename to sycl/include/CL/sycl/ext/oneapi/builtins.hpp
index a59258a2290ba..90d5ee5ff4913 100644
--- a/sycl/include/CL/sycl/intel/builtins.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/builtins.hpp
@@ -18,8 +18,8 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
-namespace experimental {
+namespace ext {
+namespace oneapi {
 
 // Provides functionality to print data from kernels in a C way:
 // - On non-host devices this function is directly mapped to printf from
@@ -30,9 +30,9 @@ namespace experimental {
 // Please refer to corresponding section in OpenCL C specification to find
 // information about format string and its differences from standard C rules.
 //
-// This function is placed under 'experimental' namespace on purpose, because it
-// has too much caveats you need to be aware of before using it. Please find
-// them below and read carefully before using it:
+// This function is placed under 'experimental' namespace on purpose, because
+// it has too many caveats you need to be aware of before using it. Please
+// find them below and read carefully before using it:
 //
 // - According to the OpenCL spec, the format string must be
 // resolvable at compile time i.e. cannot be dynamically created by the
@@ -43,14 +43,14 @@ namespace experimental {
 // test/built-ins/printf.cpp for examples
 // FIXME: this potentially can be done on SYCL FE side automatically
 //
-// - The format string is interpreted according to the OpenCL C spec, where all
-// data types has fixed size, opposed to C++ types which doesn't guarantee
+// - The format string is interpreted according to the OpenCL C spec, where
+// all data types have a fixed size, unlike C++ types, which don't guarantee
 // the exact width of particular data types (except, may be, char). This might
 // lead to unexpected result, for example: %ld in OpenCL C means that printed
-// argument has 'long' type which is 64-bit wide by the OpenCL C spec. However,
-// by C++ spec long is just at least 32-bit wide, so, you need to ensure (by
-// performing a cast, for example) that if you use %ld specifier, you pass
-// 64-bit argument to the cl::sycl::experimental::printf
+// argument has 'long' type which is 64-bit wide by the OpenCL C spec.
+// However, by the C++ spec long is only at least 32-bit wide, so you need to
+// ensure (by performing a cast, for example) that if you use the %ld
+// specifier, you pass a 64-bit argument to cl::sycl::ext::oneapi::printf
 //
 // - OpenCL spec defines several additional features, like, for example, 'v'
 // modifier which allows to print OpenCL vectors: note that these features are
@@ -67,8 +67,8 @@ int printf(const CONSTANT_AS char *__format, Args... args) {
 #endif
 }
 
-} // namespace experimental
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 
diff --git a/sycl/include/CL/sycl/intel/function_pointer.hpp b/sycl/include/CL/sycl/ext/oneapi/function_pointer.hpp
similarity index 97%
rename from sycl/include/CL/sycl/intel/function_pointer.hpp
rename to sycl/include/CL/sycl/ext/oneapi/function_pointer.hpp
index f812be911b788..5f664318d0417 100644
--- a/sycl/include/CL/sycl/intel/function_pointer.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/function_pointer.hpp
@@ -21,7 +21,8 @@ namespace detail {
 __SYCL_EXPORT cl_ulong getDeviceFunctionPointerImpl(device &D, program &P,
                                                     const char *FuncName);
 }
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 // This is a preview extension implementation, intended to provide early
 // access to a feature for review and community feedback.
@@ -83,6 +84,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName,
 
   return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName);
 }
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/functional.hpp b/sycl/include/CL/sycl/ext/oneapi/functional.hpp
similarity index 83%
rename from sycl/include/CL/sycl/intel/functional.hpp
rename to sycl/include/CL/sycl/ext/oneapi/functional.hpp
index ee4ed21b33ffd..96c84314a939a 100644
--- a/sycl/include/CL/sycl/intel/functional.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/functional.hpp
@@ -11,7 +11,8 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 template <typename T = void> struct minimum {
   T operator()(const T &lhs, const T &rhs) const {
@@ -57,7 +58,8 @@ template <typename T = void> using bit_or = std::bit_or<T>;
 template <typename T = void> using bit_xor = std::bit_xor<T>;
 template <typename T = void> using bit_and = std::bit_and<T>;
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 
 #ifdef __SYCL_DEVICE_ONLY__
 namespace detail {
@@ -93,15 +95,15 @@ struct GroupOpTag<T, detail::enable_if_t<detail::is_sgenfloat<T>::value>> {
     return Ret;                                                                \
   }
 
-__SYCL_CALC_OVERLOAD(GroupOpISigned, SMin, intel::minimum<T>)
-__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMin, intel::minimum<T>)
-__SYCL_CALC_OVERLOAD(GroupOpFP, FMin, intel::minimum<T>)
-__SYCL_CALC_OVERLOAD(GroupOpISigned, SMax, intel::maximum<T>)
-__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMax, intel::maximum<T>)
-__SYCL_CALC_OVERLOAD(GroupOpFP, FMax, intel::maximum<T>)
-__SYCL_CALC_OVERLOAD(GroupOpISigned, IAdd, intel::plus<T>)
-__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, IAdd, intel::plus<T>)
-__SYCL_CALC_OVERLOAD(GroupOpFP, FAdd, intel::plus<T>)
+__SYCL_CALC_OVERLOAD(GroupOpISigned, SMin, ext::oneapi::minimum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMin, ext::oneapi::minimum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpFP, FMin, ext::oneapi::minimum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpISigned, SMax, ext::oneapi::maximum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMax, ext::oneapi::maximum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpFP, FMax, ext::oneapi::maximum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpISigned, IAdd, ext::oneapi::plus<T>)
+__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, IAdd, ext::oneapi::plus<T>)
+__SYCL_CALC_OVERLOAD(GroupOpFP, FAdd, ext::oneapi::plus<T>)
 
 #undef __SYCL_CALC_OVERLOAD
 
diff --git a/sycl/include/CL/sycl/intel/group_algorithm.hpp b/sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp
similarity index 98%
rename from sycl/include/CL/sycl/intel/group_algorithm.hpp
rename to sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp
index e49ed1592e4da..5dfa09e6418e5 100644
--- a/sycl/include/CL/sycl/intel/group_algorithm.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp
@@ -12,9 +12,9 @@
 #include <CL/__spirv/spirv_vars.hpp>
 #include <CL/sycl/detail/spirv.hpp>
 #include <CL/sycl/detail/type_traits.hpp>
+#include <CL/sycl/ext/oneapi/functional.hpp>
+#include <CL/sycl/ext/oneapi/sub_group.hpp>
 #include <CL/sycl/group.hpp>
-#include <CL/sycl/intel/functional.hpp>
-#include <CL/sycl/intel/sub_group.hpp>
 
 #ifndef __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
 __SYCL_INLINE_NAMESPACE(cl) {
@@ -32,7 +32,8 @@ template <> inline size_t get_local_linear_range<group<3>>(group<3> g) {
   return g.get_local_range(0) * g.get_local_range(1) * g.get_local_range(2);
 }
 template <>
-inline size_t get_local_linear_range<intel::sub_group>(intel::sub_group g) {
+inline size_t
+get_local_linear_range<ext::oneapi::sub_group>(ext::oneapi::sub_group g) {
   return g.get_local_range()[0];
 }
 
@@ -53,8 +54,8 @@ __SYCL_GROUP_GET_LOCAL_LINEAR_ID(3);
 #endif // __SYCL_DEVICE_ONLY__
 
 template <>
-inline intel::sub_group::linear_id_type
-get_local_linear_id<intel::sub_group>(intel::sub_group g) {
+inline ext::oneapi::sub_group::linear_id_type
+get_local_linear_id<ext::oneapi::sub_group>(ext::oneapi::sub_group g) {
   return g.get_local_id()[0];
 }
 
@@ -79,15 +80,15 @@ template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) {
 
 template <typename T, class BinaryOperation> struct identity {};
 
-template <typename T, typename V> struct identity<T, intel::plus<V>> {
+template <typename T, typename V> struct identity<T, ext::oneapi::plus<V>> {
   static constexpr T value = 0;
 };
 
-template <typename T, typename V> struct identity<T, intel::minimum<V>> {
+template <typename T, typename V> struct identity<T, ext::oneapi::minimum<V>> {
   static constexpr T value = (std::numeric_limits<T>::max)();
 };
 
-template <typename T, typename V> struct identity<T, intel::maximum<V>> {
+template <typename T, typename V> struct identity<T, ext::oneapi::maximum<V>> {
   static constexpr T value = std::numeric_limits<T>::lowest();
 };
 
@@ -112,7 +113,8 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) {
 
 } // namespace detail
 
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 template <typename T>
 using EnableIfIsScalarArithmetic = cl::sycl::detail::enable_if_t<
@@ -822,7 +824,8 @@ template <typename Group> bool leader(Group g) {
 #endif
 }
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 #endif // __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
diff --git a/sycl/include/CL/sycl/intel/pipes.hpp b/sycl/include/CL/sycl/ext/oneapi/pipes.hpp
similarity index 98%
rename from sycl/include/CL/sycl/intel/pipes.hpp
rename to sycl/include/CL/sycl/ext/oneapi/pipes.hpp
index 8396bc1e215fc..cfa906654a4b8 100644
--- a/sycl/include/CL/sycl/intel/pipes.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/pipes.hpp
@@ -14,7 +14,8 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 template <class name, class dataT, int32_t min_capacity = 0> class pipe {
 public:
@@ -198,6 +199,7 @@ class kernel_writeable_io_pipe {
 #endif // __SYCL_DEVICE_ONLY__
 };
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
similarity index 95%
rename from sycl/include/CL/sycl/intel/reduction.hpp
rename to sycl/include/CL/sycl/ext/oneapi/reduction.hpp
index 01b44cb429d6b..79f0f66677aa4 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
@@ -9,11 +9,12 @@
 #pragma once
 
 #include <CL/sycl/accessor.hpp>
-#include <CL/sycl/intel/group_algorithm.hpp>
+#include <CL/sycl/ext/oneapi/group_algorithm.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 namespace detail {
 
@@ -27,8 +28,8 @@ using cl::sycl::detail::remove_AS;
 
 template <typename T, class BinaryOperation>
 using IsReduPlus = detail::bool_constant<
-    std::is_same<BinaryOperation, intel::plus<T>>::value ||
-    std::is_same<BinaryOperation, intel::plus<void>>::value>;
+    std::is_same<BinaryOperation, ext::oneapi::plus<T>>::value ||
+    std::is_same<BinaryOperation, ext::oneapi::plus<void>>::value>;
 
 template <typename T, class BinaryOperation>
 using IsReduMultiplies = detail::bool_constant<
@@ -37,28 +38,28 @@ using IsReduMultiplies = detail::bool_constant<
 
 template <typename T, class BinaryOperation>
 using IsReduMinimum = detail::bool_constant<
-    std::is_same<BinaryOperation, intel::minimum<T>>::value ||
-    std::is_same<BinaryOperation, intel::minimum<void>>::value>;
+    std::is_same<BinaryOperation, ext::oneapi::minimum<T>>::value ||
+    std::is_same<BinaryOperation, ext::oneapi::minimum<void>>::value>;
 
 template <typename T, class BinaryOperation>
 using IsReduMaximum = detail::bool_constant<
-    std::is_same<BinaryOperation, intel::maximum<T>>::value ||
-    std::is_same<BinaryOperation, intel::maximum<void>>::value>;
+    std::is_same<BinaryOperation, ext::oneapi::maximum<T>>::value ||
+    std::is_same<BinaryOperation, ext::oneapi::maximum<void>>::value>;
 
 template <typename T, class BinaryOperation>
 using IsReduBitOR = detail::bool_constant<
-    std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
-    std::is_same<BinaryOperation, intel::bit_or<void>>::value>;
+    std::is_same<BinaryOperation, ext::oneapi::bit_or<T>>::value ||
+    std::is_same<BinaryOperation, ext::oneapi::bit_or<void>>::value>;
 
 template <typename T, class BinaryOperation>
 using IsReduBitXOR = detail::bool_constant<
-    std::is_same<BinaryOperation, intel::bit_xor<T>>::value ||
-    std::is_same<BinaryOperation, intel::bit_xor<void>>::value>;
+    std::is_same<BinaryOperation, ext::oneapi::bit_xor<T>>::value ||
+    std::is_same<BinaryOperation, ext::oneapi::bit_xor<void>>::value>;
 
 template <typename T, class BinaryOperation>
 using IsReduBitAND = detail::bool_constant<
-    std::is_same<BinaryOperation, intel::bit_and<T>>::value ||
-    std::is_same<BinaryOperation, intel::bit_and<void>>::value>;
+    std::is_same<BinaryOperation, ext::oneapi::bit_and<T>>::value ||
+    std::is_same<BinaryOperation, ext::oneapi::bit_and<void>>::value>;
 
 template <typename T, class BinaryOperation>
 using IsReduOptForFastAtomicFetch =
@@ -166,7 +167,7 @@ class reducer {
 /// using those operations, which are based on functionality provided by
 /// sycl::atomic class.
 ///
-/// For example, it is known that 0 is identity for intel::plus operations
+/// For example, it is known that 0 is identity for ext::oneapi::plus operations
 /// accepting native scalar types to which scalar 0 is convertible.
 /// Also, for int32/64 types the atomic_combine() is lowered to
 /// sycl::atomic::fetch_add().
@@ -308,7 +309,8 @@ class reducer<T, BinaryOperation,
         .fetch_and(MValue);
   }
 
-  /// Atomic MIN operation: *ReduVarPtr = intel::minimum(*ReduVarPtr, MValue);
+  /// Atomic MIN operation: *ReduVarPtr = ext::oneapi::minimum(*ReduVarPtr,
+  /// MValue);
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
@@ -318,7 +320,8 @@ class reducer<T, BinaryOperation,
         .fetch_min(MValue);
   }
 
-  /// Atomic MAX operation: *ReduVarPtr = intel::maximum(*ReduVarPtr, MValue);
+  /// Atomic MAX operation: *ReduVarPtr = ext::oneapi::maximum(*ReduVarPtr,
+  /// MValue);
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
@@ -599,11 +602,12 @@ struct get_reduction_aux_kernel_name_t {
 /// Implements a command group function that enqueues a kernel that calls
 /// user's lambda function KernelFunc and also does one iteration of reduction
 /// of elements computed in user's lambda function.
-/// This version uses intel::reduce() algorithm to reduce elements in each
+/// This version uses ext::oneapi::reduce() algorithm to reduce elements in each
 /// of work-groups, then it calls fast sycl atomic operations to update
 /// user's reduction variable.
 ///
-/// Briefly: calls user's lambda, intel::reduce() + atomic, INT + ADD/MIN/MAX.
+/// Briefly: calls user's lambda, ext::oneapi::reduce() + atomic, INT +
+/// ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
           bool UniformWG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
@@ -622,7 +626,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
         (UniformWG || NDIt.get_global_linear_id() < NWorkItems)
             ? Reducer.MValue
             : Reducer.getIdentity();
-    Reducer.MValue = intel::reduce(NDIt.get_group(), Val, BOp);
+    Reducer.MValue = ext::oneapi::reduce(NDIt.get_group(), Val, BOp);
     if (NDIt.get_local_linear_id() == 0)
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
   });
@@ -716,11 +720,11 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 /// Implements a command group function that enqueues a kernel that
 /// calls user's lambda function and does one iteration of reduction
 /// of elements in each of work-groups.
-/// This version uses intel::reduce() algorithm to reduce elements in each
+/// This version uses ext::oneapi::reduce() algorithm to reduce elements in each
 /// of work-groups. At the end of each work-groups the partial sum is written
 /// to a global buffer.
 ///
-/// Briefly: user's lambda, intel:reduce(), FP + ADD/MIN/MAX.
+/// Briefly: user's lambda, ext::oneapi::reduce(), FP + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
           bool UniformWG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
@@ -750,7 +754,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
             ? Reducer.MValue
             : Reducer.getIdentity();
     typename Reduction::binary_operation BOp;
-    PSum = intel::reduce(NDIt.get_group(), PSum, BOp);
+    PSum = ext::oneapi::reduce(NDIt.get_group(), PSum, BOp);
     if (NDIt.get_local_linear_id() == 0) {
       if (IsUpdateOfUserVar)
         PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
@@ -863,11 +867,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 
 /// Implements a command group function that enqueues a kernel that does one
 /// iteration of reduction of elements in each of work-groups.
-/// This version uses intel::reduce() algorithm to reduce elements in each
+/// This version uses ext::oneapi::reduce() algorithm to reduce elements in each
 /// of work-groups. At the end of each work-groups the partial sum is written
 /// to a global buffer.
 ///
-/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
+/// Briefly: aux kernel, ext::oneapi::reduce(), reproducible results, FP +
+/// ADD/MIN/MAX
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
           bool UniformWG, typename InputT, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
@@ -887,7 +892,7 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
         (UniformWG || (GID < NWorkItems))
             ? In[GID]
             : Reduction::reducer_type::getIdentity();
-    PSum = intel::reduce(NDIt.get_group(), PSum, BOp);
+    PSum = ext::oneapi::reduce(NDIt.get_group(), PSum, BOp);
     if (NDIt.get_local_linear_id() == 0) {
       if (IsUpdateOfUserVar)
         PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
@@ -1054,6 +1059,7 @@ reduction(T *VarPtr, BinaryOperation) {
                                 access::mode::read_write>(VarPtr);
 }
 
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/experimental/spec_constant.hpp b/sycl/include/CL/sycl/ext/oneapi/spec_constant.hpp
similarity index 95%
rename from sycl/include/CL/sycl/experimental/spec_constant.hpp
rename to sycl/include/CL/sycl/ext/oneapi/spec_constant.hpp
index 104137fdba9c5..99eea95e39cb4 100644
--- a/sycl/include/CL/sycl/experimental/spec_constant.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/spec_constant.hpp
@@ -22,7 +22,8 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace experimental {
+namespace ext {
+namespace oneapi {
 
 class spec_const_error : public compile_program_error {
   using compile_program_error::compile_program_error;
@@ -56,6 +57,7 @@ template <typename T, typename ID = T> class spec_constant {
   }
 };
 
-} // namespace experimental
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/ext/oneapi/sub_group.hpp
similarity index 97%
rename from sycl/include/CL/sycl/intel/sub_group.hpp
rename to sycl/include/CL/sycl/ext/oneapi/sub_group.hpp
index 2c65f08218990..62475b1f230ff 100644
--- a/sycl/include/CL/sycl/intel/sub_group.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/sub_group.hpp
@@ -16,8 +16,8 @@
 #include <CL/sycl/detail/helpers.hpp>
 #include <CL/sycl/detail/spirv.hpp>
 #include <CL/sycl/detail/type_traits.hpp>
+#include <CL/sycl/ext/oneapi/functional.hpp>
 #include <CL/sycl/id.hpp>
-#include <CL/sycl/intel/functional.hpp>
 #include <CL/sycl/range.hpp>
 #include <CL/sycl/types.hpp>
 
@@ -96,7 +96,8 @@ void store(multi_ptr<T, Space> dst, const vec<T, N> &x) {
 
 } // namespace detail
 
-namespace intel {
+namespace ext {
+namespace oneapi {
 
 struct sub_group {
 
@@ -451,7 +452,7 @@ struct sub_group {
   /* --- deprecated collective functions --- */
   template <typename T>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::broadcast instead.")
+                    "sycl::ext::oneapi::broadcast instead.")
   EnableIfIsScalarArithmetic<T> broadcast(T x, id<1> local_id) const {
 #ifdef __SYCL_DEVICE_ONLY__
     return sycl::detail::spirv::GroupBroadcast<sub_group>(x, local_id);
@@ -465,7 +466,7 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::reduce instead.")
+                    "sycl::ext::oneapi::reduce instead.")
   EnableIfIsScalarArithmetic<T> reduce(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
     return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
@@ -481,7 +482,7 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::reduce instead.")
+                    "sycl::ext::oneapi::reduce instead.")
   EnableIfIsScalarArithmetic<T> reduce(T x, T init, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
     return op(init, reduce(x, op));
@@ -496,7 +497,7 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::exclusive_scan instead.")
+                    "sycl::ext::oneapi::exclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
     return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
@@ -512,7 +513,7 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::exclusive_scan instead.")
+                    "sycl::ext::oneapi::exclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, T init,
                                                BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
@@ -535,7 +536,7 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::inclusive_scan instead.")
+                    "sycl::ext::oneapi::inclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
     return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
@@ -551,7 +552,7 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   __SYCL_DEPRECATED("Collectives in the sub-group class are deprecated. Use "
-                    "sycl::intel::inclusive_scan instead.")
+                    "sycl::ext::oneapi::inclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op,
                                                T init) const {
 #ifdef __SYCL_DEVICE_ONLY__
@@ -572,6 +573,7 @@ struct sub_group {
   template <int dimensions> friend class cl::sycl::nd_item;
   sub_group() = default;
 };
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp b/sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp
deleted file mode 100644
index d712fccf0d956..0000000000000
--- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp
+++ /dev/null
@@ -1,663 +0,0 @@
-//==------------ esimd_memory_intrin.hpp - DPC++ Explicit SIMD API ---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Declares Explicit SIMD intrinsics used to implement working with
-// the SIMD classes objects.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_util.hpp>
-#include <CL/sycl/intel/esimd/esimd_enum.hpp>
-#include <cstdint>
-
-// flat_read does flat-address gather
-template <typename Ty, int N, int NumBlk = 0,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<
-    Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
-__esimd_flat_read(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                  int ElemsPerAddr = NumBlk,
-                  sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// flat_write does flat-address scatter
-template <typename Ty, int N, int NumBlk = 0,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL void
-__esimd_flat_write(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                   sycl::intel::gpu::vector_type_t<
-                       Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
-                       vals,
-                   int ElemsPerAddr = NumBlk,
-                   sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// flat_block_read reads a block of data from one flat address
-template <typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_block_read_unaligned(uint64_t addr);
-
-// flat_block_write writes a block of data using one flat address
-template <typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL void
-__esimd_flat_block_write(uint64_t addr,
-                         sycl::intel::gpu::vector_type_t<Ty, N> vals);
-
-// Reads a block of data from given surface at given offset.
-template <typename Ty, int N, typename SurfIndAliasTy>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset);
-
-// Writes given block of data to a surface with given index at given offset.
-template <typename Ty, int N, typename SurfIndAliasTy>
-SYCL_EXTERNAL void
-__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset,
-                    sycl::intel::gpu::vector_type_t<Ty, N> vals);
-
-// flat_read4 does flat-address gather4
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> SYCL_EXTERNAL
-__esimd_flat_read4(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                   sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// flat_write does flat-address scatter
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL void __esimd_flat_write4(
-    sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-    sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
-    sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// flat_atomic: flat-address atomic
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_atomic0(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                     sycl::intel::gpu::vector_type_t<uint16_t, N> pred);
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_atomic1(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                     sycl::intel::gpu::vector_type_t<uint16_t, N> pred);
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H = sycl::intel::gpu::CacheHint::None,
-          sycl::intel::gpu::CacheHint L3H = sycl::intel::gpu::CacheHint::None>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_atomic2(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<uint16_t, N> pred);
-
-// esimd_barrier, generic group barrier
-SYCL_EXTERNAL void __esimd_barrier();
-
-// slm_fence sets the SLM read/write order
-SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl);
-
-// slm_read does SLM gather
-template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_read(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                 sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// slm_write does SLM scatter
-template <typename Ty, int N>
-SYCL_EXTERNAL void
-__esimd_slm_write(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                  sycl::intel::gpu::vector_type_t<Ty, N> vals,
-                  sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// slm_block_read reads a block of data from SLM
-template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_block_read(uint32_t addr);
-
-// slm_block_write writes a block of data to SLM
-template <typename Ty, int N>
-SYCL_EXTERNAL void
-__esimd_slm_block_write(uint32_t addr,
-                        sycl::intel::gpu::vector_type_t<Ty, N> vals);
-
-// slm_read4 does SLM gather4
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>
-__esimd_slm_read4(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                  sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// slm_write4 does SLM scatter4
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask>
-SYCL_EXTERNAL void __esimd_slm_write4(
-    sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-    sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
-    sycl::intel::gpu::vector_type_t<uint16_t, N> pred = 1);
-
-// slm_atomic: SLM atomic
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_atomic0(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                    sycl::intel::gpu::vector_type_t<uint16_t, N> pred);
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_atomic1(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                    sycl::intel::gpu::vector_type_t<uint16_t, N> pred);
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_atomic2(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                    sycl::intel::gpu::vector_type_t<uint16_t, N> pred);
-
-// Media block load
-//
-// @param Ty the element data type.
-//
-// @param M the hight of the 2D block.
-//
-// @param N the width of the 2D block.
-//
-// @param TACC type of the surface handle.
-//
-// @param modifier top/bottom field surface access control.
-//
-// @param handle the surface handle.
-//
-// @param plane planar surface index.
-//
-// @param width the width of the return block.
-//
-// @param x X-coordinate of the left upper rectangle corner in BYTES.
-//
-// @param y Y-coordinate of the left upper rectangle corner in ROWS.
-//
-// @return the linearized 2D block data read from surface.
-//
-template <typename Ty, int M, int N, typename TACC>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, M * N>
-__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane,
-                         unsigned width, unsigned x, unsigned y);
-
-// Media block store
-//
-// @param Ty the element data type.
-//
-// @param M the hight of the 2D block.
-//
-// @param N the width of the 2D block.
-//
-// @param TACC type of the surface handle.
-//
-// @param modifier top/bottom field surface access control.
-//
-// @param handle the surface handle.
-//
-// @param plane planar surface index.
-//
-// @param width the width of the return block.
-//
-// @param x X-coordinate of the left upper rectangle corner in BYTES.
-//
-// @param y Y-coordinate of the left upper rectangle corner in ROWS.
-//
-// @param vals the linearized 2D block data to be written to surface.
-//
-template <typename Ty, int M, int N, typename TACC>
-SYCL_EXTERNAL void
-__esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane,
-                          unsigned width, unsigned x, unsigned y,
-                          sycl::intel::gpu::vector_type_t<Ty, M * N> vals);
-
-#ifndef __SYCL_DEVICE_ONLY__
-
-template <typename Ty, int N, int NumBlk, sycl::intel::gpu::CacheHint L1H,
-          sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<
-    Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
-__esimd_flat_read(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                  int ElemsPerAddr,
-                  sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  auto NumBlkDecoded = sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk);
-  sycl::intel::gpu::vector_type_t<
-      Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
-      V;
-  ElemsPerAddr = sycl::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr);
-
-  for (int I = 0; I < N; I++) {
-    if (pred[I]) {
-      Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
-      if (sizeof(Ty) == 2)
-        ElemsPerAddr = ElemsPerAddr / 2;
-      if (sizeof(Ty) <= 2) {
-        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
-          V[I * NumBlkDecoded + J] = *(Addr + J);
-      } else {
-        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
-          V[J * N + I] = *(Addr + J);
-      }
-    }
-  }
-  return V;
-}
-
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask,
-          sycl::intel::gpu::CacheHint L1H, sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>
-__esimd_flat_read4(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                   sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> V;
-  unsigned int Next = 0;
-
-  if constexpr (HasR(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
-        V[Next] = *Addr;
-      }
-    }
-  }
-
-  if constexpr (HasG(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty));
-        V[Next] = *Addr;
-      }
-    }
-  }
-
-  if constexpr (HasB(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty) + sizeof(Ty));
-        V[Next] = *Addr;
-      }
-    }
-  }
-
-  if constexpr (HasA(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty) + sizeof(Ty) +
-                                          sizeof(Ty));
-        V[Next] = *Addr;
-      }
-    }
-  }
-
-  return V;
-}
-
-template <typename Ty, int N, int NumBlk, sycl::intel::gpu::CacheHint L1H,
-          sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL void
-__esimd_flat_write(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                   sycl::intel::gpu::vector_type_t<
-                       Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)>
-                       vals,
-                   int ElemsPerAddr,
-                   sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  auto NumBlkDecoded = sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk);
-  ElemsPerAddr = sycl::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr);
-
-  for (int I = 0; I < N; I++) {
-    if (pred[I]) {
-      Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
-      if (sizeof(Ty) == 2)
-        ElemsPerAddr = ElemsPerAddr / 2;
-      if (sizeof(Ty) <= 2) {
-        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
-          *(Addr + J) = vals[I * NumBlkDecoded + J];
-      } else {
-        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
-          *(Addr + J) = vals[J * N + I];
-      }
-    }
-  }
-}
-
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask,
-          sycl::intel::gpu::CacheHint L1H, sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL void __esimd_flat_write4(
-    sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-    sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
-    sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> V;
-  unsigned int Next = 0;
-
-  if constexpr (HasR(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
-        *Addr = vals[Next];
-      }
-    }
-  }
-
-  if constexpr (HasG(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty));
-        *Addr = vals[Next];
-      }
-    }
-  }
-
-  if constexpr (HasB(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty) + sizeof(Ty));
-        *Addr = vals[Next];
-      }
-    }
-  }
-
-  if constexpr (HasA(Mask)) {
-    for (int I = 0; I < N; I++, Next++) {
-      if (pred[I]) {
-        Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + sizeof(Ty) + sizeof(Ty) +
-                                          sizeof(Ty));
-        *Addr = vals[Next];
-      }
-    }
-  }
-}
-
-template <typename Ty, int N, sycl::intel::gpu::CacheHint L1H,
-          sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_block_read_unaligned(uint64_t addr) {
-  sycl::intel::gpu::vector_type_t<Ty, N> V;
-
-  for (int I = 0; I < N; I++) {
-    Ty *Addr = reinterpret_cast<Ty *>(addr + I * sizeof(Ty));
-    V[I] = *Addr;
-  }
-  return V;
-}
-
-template <typename Ty, int N, sycl::intel::gpu::CacheHint L1H,
-          sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL void
-__esimd_flat_block_write(uint64_t addr,
-                         sycl::intel::gpu::vector_type_t<Ty, N> vals) {
-  for (int I = 0; I < N; I++) {
-    Ty *Addr = reinterpret_cast<Ty *>(addr + I * sizeof(Ty));
-    *Addr = vals[I];
-  }
-}
-
-template <typename Ty, int M, int N, typename TACC>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, M * N>
-__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane,
-                         unsigned width, unsigned x, unsigned y) {
-  // On host the input surface is modeled as sycl image 2d object,
-  // and the read/write access is done through accessor,
-  // which is passed in as the handle argument.
-  auto range = sycl::intel::gpu::AccessorPrivateProxy::getImageRange(handle);
-  unsigned bpp = sycl::intel::gpu::AccessorPrivateProxy::getElemSize(handle);
-  unsigned vpp = bpp / sizeof(Ty);
-  unsigned int i = x / bpp;
-  unsigned int j = y;
-
-  assert(x % bpp == 0);
-  unsigned int xbound = range[0] - 1;
-  unsigned int ybound = range[1] - 1;
-
-  sycl::intel::gpu::vector_type_t<Ty, M * N> vals;
-  for (int row = 0; row < M; row++) {
-    for (int col = 0; col < N; col += vpp) {
-      unsigned int xoff = (i > xbound) ? xbound : i;
-      unsigned int yoff = (j > ybound) ? ybound : j;
-      auto coords = cl::sycl::cl_int2(xoff, yoff);
-      cl::sycl::cl_uint4 data = handle.read(coords);
-
-      sycl::intel::gpu::vector_type_t<unsigned int, 4> res;
-      for (int idx = 0; idx < 4; idx++) {
-        res[idx] = data[idx];
-      }
-
-      constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty);
-      unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp;
-      using refTy = sycl::intel::gpu::vector_type_t<Ty, refN>;
-      auto ref = reinterpret_cast<refTy>(res);
-
-      unsigned int offset1 = col + row * N;
-      unsigned int offset2 = 0;
-      for (int idx = 0; idx < vpp; idx++) {
-        vals[offset1] = ref[offset2];
-        offset1++;
-        offset2 += stride;
-      }
-      i++;
-    }
-    i = x / bpp;
-    j++;
-  }
-
-  return vals;
-}
-
-template <typename Ty, int M, int N, typename TACC>
-SYCL_EXTERNAL void
-__esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane,
-                          unsigned width, unsigned x, unsigned y,
-                          sycl::intel::gpu::vector_type_t<Ty, M * N> vals) {
-  unsigned bpp = sycl::intel::gpu::AccessorPrivateProxy::getElemSize(handle);
-  unsigned vpp = bpp / sizeof(Ty);
-  auto range = sycl::intel::gpu::AccessorPrivateProxy::getImageRange(handle);
-  unsigned int i = x / bpp;
-  unsigned int j = y;
-
-  assert(x % bpp == 0);
-
-  for (int row = 0; row < M; row++) {
-    for (int col = 0; col < N; col += vpp) {
-      constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty);
-      sycl::intel::gpu::vector_type_t<Ty, Sz> res = 0;
-
-      unsigned int offset1 = col + row * N;
-      unsigned int offset2 = 0;
-      unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp;
-      for (int idx = 0; idx < vpp; idx++) {
-        res[offset2] = vals[offset1];
-        offset1++;
-        offset2 += stride;
-      }
-
-      using refTy = sycl::intel::gpu::vector_type_t<unsigned int, 4>;
-      auto ref = reinterpret_cast<refTy>(res);
-
-      cl::sycl::cl_uint4 data;
-      for (int idx = 0; idx < 4; idx++) {
-        data[idx] = ref[idx];
-      }
-
-      if (i < range[0] && j < range[1]) {
-        auto coords = cl::sycl::cl_int2(i, j);
-        handle.write(coords, data);
-      }
-      i++;
-    }
-    i = x / bpp;
-    j++;
-  }
-}
-
-template <typename Ty, int N>
-SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t<Ty, N> src) {
-  for (unsigned int i = 0; i != N; i++) {
-    if (src[i] != 0)
-      return 1;
-  }
-  return 0;
-}
-
-template <typename Ty, int N>
-SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t<Ty, N> src) {
-  for (unsigned int i = 0; i != N; i++) {
-    if (src[i] == 0)
-      return 0;
-  }
-  return 1;
-}
-
-template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_dp4(sycl::intel::gpu::vector_type_t<Ty, N> v1,
-            sycl::intel::gpu::vector_type_t<Ty, N> v2) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  for (auto i = 0; i != N; i += 4) {
-    Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) +
-            (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]);
-    retv[i] = dp;
-    retv[i + 1] = dp;
-    retv[i + 2] = dp;
-    retv[i + 3] = dp;
-  }
-  return retv;
-}
-
-/// TODO
-SYCL_EXTERNAL void __esimd_barrier() {}
-
-SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl) {}
-
-template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_read(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                 sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-// slm_write does SLM scatter
-template <typename Ty, int N>
-SYCL_EXTERNAL void
-__esimd_slm_write(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                  sycl::intel::gpu::vector_type_t<Ty, N> vals,
-                  sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {}
-
-// slm_block_read reads a block of data from SLM
-template <typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_block_read(uint32_t addr) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-// slm_block_write writes a block of data to SLM
-template <typename Ty, int N>
-SYCL_EXTERNAL void
-__esimd_slm_block_write(uint32_t addr,
-                        sycl::intel::gpu::vector_type_t<Ty, N> vals) {}
-
-// slm_read4 does SLM gather4
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)>
-__esimd_slm_read4(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                  sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> retv;
-  return retv;
-}
-
-// slm_write4 does SLM scatter4
-template <typename Ty, int N, sycl::intel::gpu::ChannelMaskType Mask>
-SYCL_EXTERNAL void __esimd_slm_write4(
-    sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-    sycl::intel::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
-    sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {}
-
-// slm_atomic: SLM atomic
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_atomic0(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                    sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_atomic1(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                    sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_slm_atomic2(sycl::intel::gpu::vector_type_t<uint32_t, N> addrs,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                    sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                    sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H, sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_atomic0(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                     sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H, sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_atomic1(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                     sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-template <sycl::intel::gpu::EsimdAtomicOpType Op, typename Ty, int N,
-          sycl::intel::gpu::CacheHint L1H, sycl::intel::gpu::CacheHint L3H>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_flat_atomic2(sycl::intel::gpu::vector_type_t<uint64_t, N> addrs,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src0,
-                     sycl::intel::gpu::vector_type_t<Ty, N> src1,
-                     sycl::intel::gpu::vector_type_t<uint16_t, N> pred) {
-  sycl::intel::gpu::vector_type_t<Ty, N> retv;
-  return retv;
-}
-
-template <typename Ty, int N, typename SurfIndAliasTy>
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<Ty, N>
-__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) {
-  throw cl::sycl::feature_not_supported();
-  return sycl::intel::gpu::vector_type_t<Ty, N>();
-}
-
-template <typename Ty, int N, typename SurfIndAliasTy>
-SYCL_EXTERNAL void
-__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset,
-                    sycl::intel::gpu::vector_type_t<Ty, N> vals) {
-
-  throw cl::sycl::feature_not_supported();
-}
-
-#endif // __SYCL_DEVICE_ONLY__

From cdfec56196944baaf195f8475ee6ab385b0e6247 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Tue, 21 Jul 2020 16:13:30 -0400
Subject: [PATCH 02/13] Fix compilation failures and update tests to use new
 namespaces

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/include/CL/sycl.hpp                      |  12 +--
 sycl/include/CL/sycl/accessor.hpp             |   6 +-
 sycl/include/CL/sycl/builtins.hpp             |   6 +-
 sycl/include/CL/sycl/detail/accessor_impl.hpp |   4 +-
 sycl/include/CL/sycl/detail/spirv.hpp         | 101 ++++++++++--------
 sycl/include/CL/sycl/detail/type_traits.hpp   |   8 +-
 .../ext/intel/esimd/detail/esimd_intrin.hpp   |   2 +
 .../ext/intel/esimd/detail/esimd_util.hpp     |  32 +++---
 .../include/CL/sycl/ext/intel/esimd/esimd.hpp |   2 +-
 .../CL/sycl/ext/intel/esimd/esimd_math.hpp    |  12 ++-
 .../CL/sycl/ext/intel/esimd/esimd_memory.hpp  |   2 +-
 .../CL/sycl/ext/intel/esimd/esimd_view.hpp    |   2 +-
 sycl/include/CL/sycl/ext/intel/fpga_reg.hpp   |   2 +-
 .../include/CL/sycl/ext/oneapi/atomic_ref.hpp |   8 +-
 .../CL/sycl/ext/oneapi/group_algorithm.hpp    |  67 ++++++------
 sycl/include/CL/sycl/ext/oneapi/reduction.hpp |   1 +
 sycl/include/CL/sycl/handler.hpp              |  32 +++---
 sycl/include/CL/sycl/nd_item.hpp              |   6 +-
 sycl/include/CL/sycl/pipes.hpp                |   4 +-
 sycl/include/CL/sycl/program.hpp              |   8 +-
 sycl/source/detail/program_impl.cpp           |   4 +-
 .../program_manager/program_manager.cpp       |   4 +-
 sycl/source/function_pointer.cpp              |   6 +-
 sycl/test/atomic_ref/add.cpp                  |  10 +-
 sycl/test/atomic_ref/compare_exchange.cpp     |   4 +-
 sycl/test/atomic_ref/exchange.cpp             |   4 +-
 sycl/test/atomic_ref/load.cpp                 |   4 +-
 sycl/test/atomic_ref/max.cpp                  |   4 +-
 sycl/test/atomic_ref/min.cpp                  |   4 +-
 sycl/test/atomic_ref/store.cpp                |   4 +-
 sycl/test/atomic_ref/sub.cpp                  |  10 +-
 .../basic_tests/esimd/block_load_store.cpp    |   4 +-
 sycl/test/basic_tests/esimd/esimd_math.cpp    |   4 +-
 sycl/test/basic_tests/esimd/flat_atomic.cpp   |   4 +-
 .../basic_tests/esimd/gather4_scatter4.cpp    |   4 +-
 .../test/basic_tests/esimd/gather_scatter.cpp |   4 +-
 sycl/test/basic_tests/esimd/global_var.cpp    |   2 +-
 sycl/test/basic_tests/esimd/simd.cpp          |   4 +-
 sycl/test/basic_tests/esimd/simd_merge.cpp    |   4 +-
 sycl/test/basic_tests/esimd/simd_view.cpp     |   4 +-
 sycl/test/basic_tests/esimd/slm_atomic.cpp    |   4 +-
 sycl/test/basic_tests/esimd/slm_block.cpp     |   4 +-
 sycl/test/basic_tests/esimd/slm_load.cpp      |   4 +-
 sycl/test/basic_tests/esimd/slm_load4.cpp     |   4 +-
 sycl/test/built-ins/printf.cpp                |  22 ++--
 sycl/test/built-ins/scalar_integer.cpp        |   2 +-
 sycl/test/built-ins/vector_integer.cpp        |   2 +-
 .../function-pointers/fp-as-kernel-arg.cpp    |   4 +-
 .../pass-fp-through-buffer.cpp                |   8 +-
 sycl/test/group-algorithm/all_of.cpp          |   2 +-
 sycl/test/group-algorithm/any_of.cpp          |   2 +-
 sycl/test/group-algorithm/broadcast.cpp       |   2 +-
 sycl/test/group-algorithm/exclusive_scan.cpp  |   2 +-
 sycl/test/group-algorithm/inclusive_scan.cpp  |   2 +-
 sycl/test/group-algorithm/leader.cpp          |   2 +-
 sycl/test/group-algorithm/none_of.cpp         |   2 +-
 sycl/test/group-algorithm/reduce.cpp          |   2 +-
 sycl/test/linear_id/linear-sub_group.cpp      |   2 +-
 sycl/test/reduction/reduction_ctor.cpp        |  22 ++--
 .../reduction/reduction_nd_conditional.cpp    |  10 +-
 sycl/test/reduction/reduction_nd_ext_type.hpp |  14 +--
 sycl/test/reduction/reduction_nd_s0_dw.cpp    |  40 +++----
 sycl/test/reduction/reduction_nd_s0_rw.cpp    |  40 +++----
 sycl/test/reduction/reduction_nd_s1_dw.cpp    |  40 +++----
 sycl/test/reduction/reduction_nd_s1_rw.cpp    |  40 +++----
 sycl/test/reduction/reduction_placeholder.cpp |  14 +--
 sycl/test/reduction/reduction_transparent.cpp |   8 +-
 sycl/test/reduction/reduction_usm.cpp         |  16 +--
 .../regression/sub-group-store-const-ref.cpp  |   2 +-
 sycl/test/spec_const/spec_const_hw.cpp        |   6 +-
 sycl/test/spec_const/spec_const_neg.cpp       |   6 +-
 sycl/test/spec_const/spec_const_redefine.cpp  |   4 +-
 sycl/test/spec_const/spec_const_types.cpp     |  24 ++---
 sycl/test/sub_group/attributes.cpp            |  17 +--
 sycl/test/sub_group/barrier.cpp               |   8 +-
 sycl/test/sub_group/broadcast.hpp             |   2 +-
 sycl/test/sub_group/common.cpp                |   2 +-
 sycl/test/sub_group/common_ocl.cpp            |   2 +-
 sycl/test/sub_group/generic-shuffle.cpp       |   4 +-
 sycl/test/sub_group/helper.hpp                |  28 +++--
 sycl/test/sub_group/load_store.cpp            |  13 ++-
 sycl/test/sub_group/reduce.hpp                |  26 ++---
 sycl/test/sub_group/scan.hpp                  |  34 +++---
 sycl/test/sub_group/shuffle.hpp               |   4 +-
 sycl/test/sub_group/vote.cpp                  |   2 +-
 85 files changed, 475 insertions(+), 434 deletions(-)
 mode change 100755 => 100644 sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp

diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp
index 4fd3c55b0952c..c5c20813d014a 100644
--- a/sycl/include/CL/sycl.hpp
+++ b/sycl/include/CL/sycl.hpp
@@ -18,16 +18,16 @@
 #include <CL/sycl/device_selector.hpp>
 #include <CL/sycl/event.hpp>
 #include <CL/sycl/exception.hpp>
+#include <CL/sycl/ext/oneapi/atomic.hpp>
+#include <CL/sycl/ext/oneapi/builtins.hpp>
+#include <CL/sycl/ext/oneapi/function_pointer.hpp>
+#include <CL/sycl/ext/oneapi/group_algorithm.hpp>
+#include <CL/sycl/ext/oneapi/reduction.hpp>
+#include <CL/sycl/ext/oneapi/sub_group.hpp>
 #include <CL/sycl/group.hpp>
 #include <CL/sycl/handler.hpp>
 #include <CL/sycl/id.hpp>
 #include <CL/sycl/image.hpp>
-#include <CL/sycl/intel/atomic.hpp>
-#include <CL/sycl/intel/builtins.hpp>
-#include <CL/sycl/intel/function_pointer.hpp>
-#include <CL/sycl/intel/group_algorithm.hpp>
-#include <CL/sycl/intel/reduction.hpp>
-#include <CL/sycl/intel/sub_group.hpp>
 #include <CL/sycl/item.hpp>
 #include <CL/sycl/kernel.hpp>
 #include <CL/sycl/multi_ptr.hpp>
diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp
index 7b26a13f475e7..6a4fc49e72916 100755
--- a/sycl/include/CL/sycl/accessor.hpp
+++ b/sycl/include/CL/sycl/accessor.hpp
@@ -197,12 +197,14 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 // Forward declare a "back-door" access class to support ESIMD.
 class AccessorPrivateProxy;
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 
@@ -431,7 +433,7 @@ class image_accessor
 #endif
 
 private:
-  friend class sycl::intel::gpu::AccessorPrivateProxy;
+  friend class sycl::ext::intel::gpu::AccessorPrivateProxy;
 
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
   const OCLImageTy getNativeImageObj() const { return MImageObj; }
@@ -881,7 +883,7 @@ class accessor :
 #endif // __SYCL_DEVICE_ONLY__
 
 private:
-  friend class sycl::intel::gpu::AccessorPrivateProxy;
+  friend class sycl::ext::intel::gpu::AccessorPrivateProxy;
 
 public:
   using value_type = DataT;
diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
index 9671987643f41..fbf01679463bf 100644
--- a/sycl/include/CL/sycl/builtins.hpp
+++ b/sycl/include/CL/sycl/builtins.hpp
@@ -724,14 +724,16 @@ detail::enable_if_t<detail::is_geninteger<T>::value, T> clz(T x) __NOEXC {
   return __sycl_std::__invoke_clz<T>(x);
 }
 
-namespace intel {
+namespace ext {
+namespace oneapi {
 // geninteger ctz (geninteger x)
 template <typename T>
 sycl::detail::enable_if_t<sycl::detail::is_geninteger<T>::value, T>
 ctz(T x) __NOEXC {
   return __sycl_std::__invoke_ctz<T>(x);
 }
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 
 // geninteger mad_hi (geninteger a, geninteger b, geninteger c)
 template <typename T>
diff --git a/sycl/include/CL/sycl/detail/accessor_impl.hpp b/sycl/include/CL/sycl/detail/accessor_impl.hpp
index 76676014975c2..c5913a6cf5272 100644
--- a/sycl/include/CL/sycl/detail/accessor_impl.hpp
+++ b/sycl/include/CL/sycl/detail/accessor_impl.hpp
@@ -17,12 +17,14 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 // Forward declare a "back-door" access class to support ESIMD.
 class AccessorPrivateProxy;
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 
@@ -170,7 +172,7 @@ class AccessorBaseHost {
   AccessorImplPtr impl;
 
 private:
-  friend class sycl::intel::gpu::AccessorPrivateProxy;
+  friend class sycl::ext::intel::gpu::AccessorPrivateProxy;
 };
 
 class __SYCL_EXPORT LocalAccessorImplHost {
diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp
index d662e2afc7880..989cf6a3a096a 100644
--- a/sycl/include/CL/sycl/detail/spirv.hpp
+++ b/sycl/include/CL/sycl/detail/spirv.hpp
@@ -12,14 +12,16 @@
 #include <CL/__spirv/spirv_vars.hpp>
 #include <CL/sycl/detail/generic_type_traits.hpp>
 #include <CL/sycl/detail/type_traits.hpp>
-#include <CL/sycl/intel/atomic_enums.hpp>
+#include <CL/sycl/ext/oneapi/atomic_enums.hpp>
 
 #ifdef __SYCL_DEVICE_ONLY__
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace intel {
+namespace ext {
+namespace oneapi {
 struct sub_group;
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 namespace detail {
 namespace spirv {
 
@@ -29,7 +31,7 @@ template <int Dimensions> struct group_scope<group<Dimensions>> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup;
 };
 
-template <> struct group_scope<::cl::sycl::intel::sub_group> {
+template <> struct group_scope<::cl::sycl::ext::oneapi::sub_group> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
 };
 
@@ -87,23 +89,23 @@ T GroupBroadcast(T x, id<Dimensions> local_id) {
 // Single happens-before means semantics should always apply to all spaces
 // Although consume is unsupported, forwarding to acquire is valid
 static inline constexpr __spv::MemorySemanticsMask::Flag
-getMemorySemanticsMask(intel::memory_order Order) {
+getMemorySemanticsMask(ext::oneapi::memory_order Order) {
   __spv::MemorySemanticsMask::Flag SpvOrder = __spv::MemorySemanticsMask::None;
   switch (Order) {
-  case intel::memory_order::relaxed:
+  case ext::oneapi::memory_order::relaxed:
     SpvOrder = __spv::MemorySemanticsMask::None;
     break;
-  case intel::memory_order::__consume_unsupported:
-  case intel::memory_order::acquire:
+  case ext::oneapi::memory_order::__consume_unsupported:
+  case ext::oneapi::memory_order::acquire:
     SpvOrder = __spv::MemorySemanticsMask::Acquire;
     break;
-  case intel::memory_order::release:
+  case ext::oneapi::memory_order::release:
     SpvOrder = __spv::MemorySemanticsMask::Release;
     break;
-  case intel::memory_order::acq_rel:
+  case ext::oneapi::memory_order::acq_rel:
     SpvOrder = __spv::MemorySemanticsMask::AcquireRelease;
     break;
-  case intel::memory_order::seq_cst:
+  case ext::oneapi::memory_order::seq_cst:
     SpvOrder = __spv::MemorySemanticsMask::SequentiallyConsistent;
     break;
   }
@@ -113,17 +115,18 @@ getMemorySemanticsMask(intel::memory_order Order) {
       __spv::MemorySemanticsMask::CrossWorkgroupMemory);
 }
 
-static inline constexpr __spv::Scope::Flag getScope(intel::memory_scope Scope) {
+static inline constexpr __spv::Scope::Flag
+getScope(ext::oneapi::memory_scope Scope) {
   switch (Scope) {
-  case intel::memory_scope::work_item:
+  case ext::oneapi::memory_scope::work_item:
     return __spv::Scope::Invocation;
-  case intel::memory_scope::sub_group:
+  case ext::oneapi::memory_scope::sub_group:
     return __spv::Scope::Subgroup;
-  case intel::memory_scope::work_group:
+  case ext::oneapi::memory_scope::work_group:
     return __spv::Scope::Workgroup;
-  case intel::memory_scope::device:
+  case ext::oneapi::memory_scope::device:
     return __spv::Scope::Device;
-  case intel::memory_scope::system:
+  case ext::oneapi::memory_scope::system:
     return __spv::Scope::CrossDevice;
   }
 }
@@ -131,8 +134,10 @@ static inline constexpr __spv::Scope::Flag getScope(intel::memory_scope Scope) {
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
 AtomicCompareExchange(multi_ptr<T, AddressSpace> MPtr,
-                      intel::memory_scope Scope, intel::memory_order Success,
-                      intel::memory_order Failure, T Desired, T Expected) {
+                      ext::oneapi::memory_scope Scope,
+                      ext::oneapi::memory_order Success,
+                      ext::oneapi::memory_order Failure, T Desired,
+                      T Expected) {
   auto SPIRVSuccess = getMemorySemanticsMask(Success);
   auto SPIRVFailure = getMemorySemanticsMask(Failure);
   auto SPIRVScope = getScope(Scope);
@@ -144,8 +149,10 @@ AtomicCompareExchange(multi_ptr<T, AddressSpace> MPtr,
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_floating_point<T>::value, T>
 AtomicCompareExchange(multi_ptr<T, AddressSpace> MPtr,
-                      intel::memory_scope Scope, intel::memory_order Success,
-                      intel::memory_order Failure, T Desired, T Expected) {
+                      ext::oneapi::memory_scope Scope,
+                      ext::oneapi::memory_order Success,
+                      ext::oneapi::memory_order Failure, T Desired,
+                      T Expected) {
   using I = detail::make_unsinged_integer_t<T>;
   auto SPIRVSuccess = getMemorySemanticsMask(Success);
   auto SPIRVFailure = getMemorySemanticsMask(Failure);
@@ -162,8 +169,8 @@ AtomicCompareExchange(multi_ptr<T, AddressSpace> MPtr,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicLoad(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-           intel::memory_order Order) {
+AtomicLoad(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+           ext::oneapi::memory_order Order) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -172,8 +179,8 @@ AtomicLoad(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_floating_point<T>::value, T>
-AtomicLoad(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-           intel::memory_order Order) {
+AtomicLoad(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+           ext::oneapi::memory_order Order) {
   using I = detail::make_unsinged_integer_t<T>;
   auto *PtrInt =
       reinterpret_cast<typename multi_ptr<I, AddressSpace>::pointer_t>(
@@ -186,8 +193,8 @@ AtomicLoad(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value>
-AtomicStore(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-            intel::memory_order Order, T Value) {
+AtomicStore(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+            ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -196,8 +203,8 @@ AtomicStore(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_floating_point<T>::value>
-AtomicStore(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-            intel::memory_order Order, T Value) {
+AtomicStore(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+            ext::oneapi::memory_order Order, T Value) {
   using I = detail::make_unsinged_integer_t<T>;
   auto *PtrInt =
       reinterpret_cast<typename multi_ptr<I, AddressSpace>::pointer_t>(
@@ -210,8 +217,8 @@ AtomicStore(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicExchange(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-               intel::memory_order Order, T Value) {
+AtomicExchange(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+               ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -220,8 +227,8 @@ AtomicExchange(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_floating_point<T>::value, T>
-AtomicExchange(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-               intel::memory_order Order, T Value) {
+AtomicExchange(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+               ext::oneapi::memory_order Order, T Value) {
   using I = detail::make_unsinged_integer_t<T>;
   auto *PtrInt =
       reinterpret_cast<typename multi_ptr<I, AddressSpace>::pointer_t>(
@@ -236,8 +243,8 @@ AtomicExchange(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicIAdd(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-           intel::memory_order Order, T Value) {
+AtomicIAdd(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+           ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -246,8 +253,8 @@ AtomicIAdd(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicISub(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-           intel::memory_order Order, T Value) {
+AtomicISub(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+           ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -256,8 +263,8 @@ AtomicISub(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicAnd(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-          intel::memory_order Order, T Value) {
+AtomicAnd(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+          ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -266,8 +273,8 @@ AtomicAnd(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicOr(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-         intel::memory_order Order, T Value) {
+AtomicOr(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+         ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -276,8 +283,8 @@ AtomicOr(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicXor(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-          intel::memory_order Order, T Value) {
+AtomicXor(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+          ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -286,8 +293,8 @@ AtomicXor(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicMin(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-          intel::memory_order Order, T Value) {
+AtomicMin(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+          ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
@@ -296,8 +303,8 @@ AtomicMin(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
 
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
-AtomicMax(multi_ptr<T, AddressSpace> MPtr, intel::memory_scope Scope,
-          intel::memory_order Order, T Value) {
+AtomicMax(multi_ptr<T, AddressSpace> MPtr, ext::oneapi::memory_scope Scope,
+          ext::oneapi::memory_order Order, T Value) {
   auto *Ptr = MPtr.get();
   auto SPIRVOrder = getMemorySemanticsMask(Order);
   auto SPIRVScope = getScope(Scope);
diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp
index 3f52acc8a2de2..5c90569490924 100644
--- a/sycl/include/CL/sycl/detail/type_traits.hpp
+++ b/sycl/include/CL/sycl/detail/type_traits.hpp
@@ -18,9 +18,11 @@
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 template <int Dimensions> class group;
-namespace intel {
+namespace ext {
+namespace oneapi {
 struct sub_group;
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 namespace detail {
 namespace half_impl {
 class half;
@@ -313,7 +315,7 @@ struct is_group<group<Dimensions>> : std::true_type {};
 
 template <typename T> struct is_sub_group : std::false_type {};
 
-template <> struct is_sub_group<intel::sub_group> : std::true_type {};
+template <> struct is_sub_group<ext::oneapi::sub_group> : std::true_type {};
 
 template <typename T>
 struct is_generic_group
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp
index fdaca49bf6e30..e1c19df0b9076 100644
--- a/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp
@@ -121,6 +121,7 @@ __esimd_wrregion(sycl::ext::intel::gpu::vector_type_t<T, N> OldVal,
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 // TODO dependencies on the std SYCL concepts like images
@@ -211,6 +212,7 @@ readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
 
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
 
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp
old mode 100755
new mode 100644
index e0b7323a8bafe..b343da5a3666a
--- a/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp
@@ -57,11 +57,11 @@ template <typename T> struct is_esimd_vector {
   static const bool value = false;
 };
 template <typename T, int N>
-struct is_esimd_vector<sycl::intel::gpu::simd<T, N>> {
+struct is_esimd_vector<sycl::ext::intel::gpu::simd<T, N>> {
   static const bool value = true;
 };
 template <typename T, int N>
-struct is_esimd_vector<sycl::intel::gpu::vector_type<T, N>> {
+struct is_esimd_vector<sycl::ext::intel::gpu::vector_type<T, N>> {
   static const bool value = true;
 };
 
@@ -79,12 +79,12 @@ struct is_dword_type
 };
 
 template <typename T, int N>
-struct is_dword_type<sycl::intel::gpu::vector_type<T, N>> {
+struct is_dword_type<sycl::ext::intel::gpu::vector_type<T, N>> {
   static const bool value = is_dword_type<T>::value;
 };
 
 template <typename T, int N>
-struct is_dword_type<sycl::intel::gpu::simd<T, N>> {
+struct is_dword_type<sycl::ext::intel::gpu::simd<T, N>> {
   static const bool value = is_dword_type<T>::value;
 };
 
@@ -97,11 +97,12 @@ struct is_word_type
                            typename std::remove_const<T>::type>::value> {};
 
 template <typename T, int N>
-struct is_word_type<sycl::intel::gpu::vector_type<T, N>> {
+struct is_word_type<sycl::ext::intel::gpu::vector_type<T, N>> {
   static const bool value = is_word_type<T>::value;
 };
 
-template <typename T, int N> struct is_word_type<sycl::intel::gpu::simd<T, N>> {
+template <typename T, int N>
+struct is_word_type<sycl::ext::intel::gpu::simd<T, N>> {
   static const bool value = is_word_type<T>::value;
 };
 
@@ -114,11 +115,12 @@ struct is_byte_type
                            typename std::remove_const<T>::type>::value> {};
 
 template <typename T, int N>
-struct is_byte_type<sycl::intel::gpu::vector_type<T, N>> {
+struct is_byte_type<sycl::ext::intel::gpu::vector_type<T, N>> {
   static const bool value = is_byte_type<T>::value;
 };
 
-template <typename T, int N> struct is_byte_type<sycl::intel::gpu::simd<T, N>> {
+template <typename T, int N>
+struct is_byte_type<sycl::ext::intel::gpu::simd<T, N>> {
   static const bool value = is_byte_type<T>::value;
 };
 
@@ -152,33 +154,33 @@ struct is_qword_type
                            typename std::remove_const<T>::type>::value> {};
 
 template <typename T, int N>
-struct is_qword_type<sycl::intel::gpu::vector_type<T, N>> {
+struct is_qword_type<sycl::ext::intel::gpu::vector_type<T, N>> {
   static const bool value = is_qword_type<T>::value;
 };
 
 template <typename T, int N>
-struct is_qword_type<sycl::intel::gpu::simd<T, N>> {
+struct is_qword_type<sycl::ext::intel::gpu::simd<T, N>> {
   static const bool value = is_qword_type<T>::value;
 };
 
 // Extends to ESIMD vector types.
 template <typename T, int N>
-struct is_fp_or_dword_type<sycl::intel::gpu::vector_type<T, N>> {
+struct is_fp_or_dword_type<sycl::ext::intel::gpu::vector_type<T, N>> {
   static const bool value = is_fp_or_dword_type<T>::value;
 };
 
 template <typename T, int N>
-struct is_fp_or_dword_type<sycl::intel::gpu::simd<T, N>> {
+struct is_fp_or_dword_type<sycl::ext::intel::gpu::simd<T, N>> {
   static const bool value = is_fp_or_dword_type<T>::value;
 };
 
 /// Convert types into vector types
 template <typename T> struct simd_type {
-  using type = sycl::intel::gpu::simd<T, 1>;
+  using type = sycl::ext::intel::gpu::simd<T, 1>;
 };
 template <typename T, int N>
-struct simd_type<sycl::intel::gpu::vector_type<T, N>> {
-  using type = sycl::intel::gpu::simd<T, N>;
+struct simd_type<sycl::ext::intel::gpu::vector_type<T, N>> {
+  using type = sycl::ext::intel::gpu::simd<T, N>;
 };
 
 template <typename T> struct simd_type<T &> {
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp
index c24cac7ad0b67..6b2bd52ff4ca5 100644
--- a/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp
@@ -453,7 +453,7 @@ ESIMD_INLINE simd<U, n> convert(simd<T, n> val) {
 #ifndef __SYCL_DEVICE_ONLY__
 template <typename Ty, int N>
 std::ostream &operator<<(std::ostream &OS,
-                         const sycl::intel::gpu::simd<Ty, N> &V) {
+                         const sycl::ext::intel::gpu::simd<Ty, N> &V) {
   OS << "{";
   for (int I = 0; I < N; I++) {
     OS << V[I];
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp
index 4eb145a315124..c04e70fee8a6c 100644
--- a/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp
@@ -10,15 +10,16 @@
 
 #pragma once
 
-#include <CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
-#include <CL/sycl/intel/esimd/detail/esimd_util.hpp>
-#include <CL/sycl/intel/esimd/esimd.hpp>
-#include <CL/sycl/intel/esimd/esimd_enum.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_util.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
 #include <cstdint>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 namespace gpu {
 
@@ -1946,5 +1947,6 @@ simd<T, N> esimd_dp4(simd<T, N> v1, simd<T, N> v2) {
 
 } // namespace gpu
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp
index 0dda839f89a16..67f6082e469a2 100644
--- a/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp
@@ -10,12 +10,12 @@
 
 #pragma once
 
-#include <CL/sycl/half_type.hpp>
 #include <CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp>
 #include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
 #include <CL/sycl/ext/intel/esimd/detail/esimd_util.hpp>
 #include <CL/sycl/ext/intel/esimd/esimd.hpp>
 #include <CL/sycl/ext/intel/esimd/esimd_enum.hpp>
+#include <CL/sycl/half_type.hpp>
 #include <cstdint>
 
 __SYCL_INLINE_NAMESPACE(cl) {
diff --git a/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp
index abded4def0c25..7455b13ef509f 100644
--- a/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp
+++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
+#include <CL/sycl/ext/intel/esimd/detail/esimd_types.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
diff --git a/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp b/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp
index f183d420ca2d0..a8c1b3605dce6 100644
--- a/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp
@@ -32,6 +32,6 @@ template <typename T> T fpga_reg(const T &t) {
 // Currently clang does not support nested namespace for attributes
 namespace intelfpga {
 template <typename T> T fpga_reg(const T &t) {
-  return cl::sycl::intel::fpga_reg(t);
+  return cl::sycl::ext::intel::fpga_reg(t);
 }
 }
diff --git a/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
index 833ae4fe8e924..19bfbefefb3a2 100644
--- a/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
@@ -128,16 +128,16 @@ class atomic_ref_base {
       detail::IsValidAtomicType<T>::value,
       "Invalid atomic type.  Valid types are arithmetic and pointer types");
   static_assert(!std::is_same<T, bool>::value,
-                "intel::atomic_ref does not support bool type");
+                "ext::oneapi::atomic_ref does not support bool type");
   static_assert(!(std::is_same<T, char>::value ||
                   std::is_same<T, signed char>::value ||
                   std::is_same<T, unsigned char>::value),
-                "intel::atomic_ref does not support char type");
+                "ext::oneapi::atomic_ref does not support char type");
   static_assert(!(std::is_same<T, short>::value ||
                   std::is_same<T, unsigned short>::value),
-                "intel::atomic_ref does not support short type");
+                "ext::oneapi::atomic_ref does not support short type");
   static_assert(!std::is_pointer<T>::value,
-                "intel::atomic_ref does not yet support pointer types");
+                "ext::oneapi::atomic_ref does not yet support pointer types");
   static_assert(detail::IsValidAtomicAddressSpace<AddressSpace>::value,
                 "Invalid atomic address_space.  Valid address spaces are: "
                 "global_space, local_space, global_device_space");
diff --git a/sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp b/sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp
index 5dfa09e6418e5..abb8011de08a2 100644
--- a/sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/group_algorithm.hpp
@@ -15,6 +15,7 @@
 #include <CL/sycl/ext/oneapi/functional.hpp>
 #include <CL/sycl/ext/oneapi/sub_group.hpp>
 #include <CL/sycl/group.hpp>
+#include <CL/sycl/nd_item.hpp>
 
 #ifndef __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
 __SYCL_INLINE_NAMESPACE(cl) {
@@ -33,7 +34,7 @@ template <> inline size_t get_local_linear_range<group<3>>(group<3> g) {
 }
 template <>
 inline size_t
-get_local_linear_range<intel::sub_group>(ext::oneapi::sub_group g) {
+get_local_linear_range<ext::oneapi::sub_group>(ext::oneapi::sub_group g) {
   return g.get_local_range()[0];
 }
 
@@ -131,7 +132,7 @@ using EnableIfIsPointer =
 template <typename Group> bool all_of(Group, bool pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return sycl::detail::spirv::GroupAll<Group>(pred);
 #else
@@ -145,7 +146,7 @@ template <typename Group, typename T, class Predicate>
 bool all_of(Group g, T x, Predicate pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   return all_of(g, pred(x));
 }
 
@@ -154,7 +155,7 @@ EnableIfIsPointer<Ptr, bool> all_of(Group g, Ptr first, Ptr last,
                                     Predicate pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   bool partial = true;
   sycl::detail::for_each(
@@ -174,7 +175,7 @@ EnableIfIsPointer<Ptr, bool> all_of(Group g, Ptr first, Ptr last,
 template <typename Group> bool any_of(Group, bool pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return sycl::detail::spirv::GroupAny<Group>(pred);
 #else
@@ -188,7 +189,7 @@ template <typename Group, typename T, class Predicate>
 bool any_of(Group g, T x, Predicate pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   return any_of(g, pred(x));
 }
 
@@ -198,7 +199,7 @@ EnableIfIsPointer<Ptr, bool> any_of(Group g, Ptr first, Ptr last,
 #ifdef __SYCL_DEVICE_ONLY__
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   bool partial = false;
   sycl::detail::for_each(
       g, first, last,
@@ -217,7 +218,7 @@ EnableIfIsPointer<Ptr, bool> any_of(Group g, Ptr first, Ptr last,
 template <typename Group> bool none_of(Group, bool pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return sycl::detail::spirv::GroupAll<Group>(!pred);
 #else
@@ -231,7 +232,7 @@ template <typename Group, typename T, class Predicate>
 bool none_of(Group g, T x, Predicate pred) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   return none_of(g, pred(x));
 }
 
@@ -241,7 +242,7 @@ EnableIfIsPointer<Ptr, bool> none_of(Group g, Ptr first, Ptr last,
 #ifdef __SYCL_DEVICE_ONLY__
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   return !any_of(g, first, last, pred);
 #else
   (void)g;
@@ -258,7 +259,7 @@ EnableIfIsScalarArithmetic<T> broadcast(Group, T x,
                                         typename Group::id_type local_id) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return sycl::detail::spirv::GroupBroadcast<Group>(x, local_id);
 #else
@@ -274,7 +275,7 @@ EnableIfIsVectorArithmetic<T> broadcast(Group g, T x,
                                         typename Group::id_type local_id) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   T result;
   for (int s = 0; s < x.get_size(); ++s) {
@@ -295,7 +296,7 @@ EnableIfIsScalarArithmetic<T>
 broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return broadcast(
       g, x,
@@ -314,7 +315,7 @@ EnableIfIsVectorArithmetic<T>
 broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   T result;
   for (int s = 0; s < x.get_size(); ++s) {
@@ -334,7 +335,7 @@ template <typename Group, typename T>
 EnableIfIsScalarArithmetic<T> broadcast(Group g, T x) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return broadcast(g, x, 0);
 #else
@@ -349,7 +350,7 @@ template <typename Group, typename T>
 EnableIfIsVectorArithmetic<T> broadcast(Group g, T x) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   T result;
   for (int s = 0; s < x.get_size(); ++s) {
@@ -368,7 +369,7 @@ template <typename Group, typename T, class BinaryOperation>
 EnableIfIsScalarArithmetic<T> reduce(Group, T x, BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x, x)), T>::value ||
@@ -389,7 +390,7 @@ template <typename Group, typename T, class BinaryOperation>
 EnableIfIsVectorArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x[0], x[0])),
@@ -409,7 +410,7 @@ EnableIfIsScalarArithmetic<T> reduce(Group g, V x, T init,
                                      BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(init, x)), T>::value ||
@@ -430,7 +431,7 @@ EnableIfIsVectorArithmetic<T> reduce(Group g, V x, T init,
                                      BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(init[0], x[0])),
@@ -456,7 +457,7 @@ EnableIfIsPointer<Ptr, typename Ptr::element_type>
 reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(*first, *first)),
@@ -487,7 +488,7 @@ EnableIfIsPointer<Ptr, T> reduce(Group g, Ptr first, Ptr last, T init,
                                  BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(init, *first)), T>::value ||
@@ -515,7 +516,7 @@ EnableIfIsScalarArithmetic<T> exclusive_scan(Group, T x,
                                              BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(x, x)), T>::value ||
                     (std::is_same<T, half>::value &&
@@ -536,7 +537,7 @@ EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, T x,
                                              BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x[0], x[0])),
@@ -556,7 +557,7 @@ EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, V x, T init,
                                              BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(init[0], x[0])),
@@ -576,7 +577,7 @@ EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, V x, T init,
                                              BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(init, x)), T>::value ||
                     (std::is_same<T, half>::value &&
@@ -607,7 +608,7 @@ exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init,
                BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(*first, *first)), T>::value ||
@@ -669,7 +670,7 @@ EnableIfIsVectorArithmetic<T> inclusive_scan(Group g, T x,
                                              BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x[0], x[0])),
@@ -689,7 +690,7 @@ EnableIfIsScalarArithmetic<T> inclusive_scan(Group, T x,
                                              BinaryOperation binary_op) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(x, x)), T>::value ||
                     (std::is_same<T, half>::value &&
@@ -710,7 +711,7 @@ EnableIfIsScalarArithmetic<T>
 inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(init, x)), T>::value ||
                     (std::is_same<T, half>::value &&
@@ -733,7 +734,7 @@ EnableIfIsVectorArithmetic<T>
 inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(init[0], x[0])), T>::value ||
@@ -754,7 +755,7 @@ inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
                BinaryOperation binary_op, T init) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(init, *first)), T>::value ||
@@ -812,7 +813,7 @@ EnableIfIsPointer<InPtr, OutPtr> inclusive_scan(Group g, InPtr first,
 template <typename Group> bool leader(Group g) {
   static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
-                "intel::sub_group class.");
+                "ext::oneapi::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   typename Group::linear_id_type linear_id =
       sycl::detail::get_local_linear_id(g);
diff --git a/sycl/include/CL/sycl/ext/oneapi/reduction.hpp b/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
index 79f0f66677aa4..42d0109b55468 100644
--- a/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <CL/sycl/accessor.hpp>
 #include <CL/sycl/ext/oneapi/group_algorithm.hpp>
+#include <CL/sycl/handler.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 0a40efb33bd2e..80f0b38deb736 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -172,7 +172,8 @@ checkValueRange(const T &V) {
 
 } // namespace detail
 
-namespace intel {
+namespace ext {
+namespace oneapi {
 namespace detail {
 template <typename T, class BinaryOperation, int Dims, bool IsUSM,
           access::mode AccMode, access::placeholder IsPlaceholder>
@@ -196,7 +197,8 @@ enable_if_t<!Reduction::has_fast_atomics>
 reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
               Reduction &Redu);
 } // namespace detail
-} // namespace intel
+} // namespace oneapi
+} // namespace ext
 
 /// Command group handler class.
 ///
@@ -339,7 +341,7 @@ class __SYCL_EXPORT handler {
   // Recursively calls itself until arguments pack is fully processed.
   // The version for regular(standard layout) argument.
   template <typename T, typename... Ts>
-  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... Args) {
+  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
     set_arg(ArgIndex, std::move(Arg));
     setArgsHelper(++ArgIndex, std::move(Args)...);
   }
@@ -806,7 +808,7 @@ class __SYCL_EXPORT handler {
   /// Registers pack of arguments(Args) with indexes starting from 0.
   ///
   /// \param Args are argument values to be set.
-  template <typename... Ts> void set_args(Ts &&... Args) {
+  template <typename... Ts> void set_args(Ts &&...Args) {
     setArgsHelper(0, std::move(Args)...);
   }
 
@@ -968,8 +970,8 @@ class __SYCL_EXPORT handler {
   detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                       Reduction::has_fast_atomics && !Reduction::is_usm>
   parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
-    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-                                          Redu.getUserAccessor());
+    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                                Redu.getUserAccessor());
   }
 
   /// Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -981,8 +983,8 @@ class __SYCL_EXPORT handler {
   detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                       Reduction::has_fast_atomics && Reduction::is_usm>
   parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
-    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-                                          Redu.getUSMPointer());
+    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                                Redu.getUSMPointer());
   }
 
   /// Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -1002,8 +1004,8 @@ class __SYCL_EXPORT handler {
   parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     auto RWAcc = Redu.getReadWriteScalarAcc(*this);
-    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-                                          RWAcc);
+    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
+                                                RWAcc);
     this->finalize();
 
     // Copy from RWAcc to user's reduction accessor.
@@ -1030,7 +1032,7 @@ class __SYCL_EXPORT handler {
   /// TODO: Need to handle more than 1 reduction in parallel_for().
   /// TODO: Support HOST. The kernels called by this parallel_for() may use
   /// some functionality that is not yet supported on HOST such as:
-  /// barrier(), and intel::reduce() that also may be used in more
+  /// barrier(), and ext::oneapi::reduce() that also may be used in more
   /// optimized implementations waiting for their turn of code-review.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
@@ -1052,7 +1054,7 @@ class __SYCL_EXPORT handler {
     //    necessary to reduce all partial sums into one final sum.
 
     // 1. Call the kernel that includes user's lambda function.
-    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
+    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
@@ -1082,8 +1084,8 @@ class __SYCL_EXPORT handler {
       // Associate it with handler manually.
       if (NWorkGroups == 1 && !Reduction::is_usm)
         Redu.associateWithHandler(AuxHandler);
-      intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
-                                                           NWorkItems, Redu);
+      ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
+          AuxHandler, Range, NWorkItems, Redu);
       MLastEvent = AuxHandler.finalize();
 
       NWorkItems = NWorkGroups;
@@ -1812,7 +1814,7 @@ class __SYCL_EXPORT handler {
   // in handler from reduction_impl methods.
   template <typename T, class BinaryOperation, int Dims, bool IsUSM,
             access::mode AccMode, access::placeholder IsPlaceholder>
-  friend class intel::detail::reduction_impl;
+  friend class ext::oneapi::detail::reduction_impl;
 
   friend void detail::associateWithHandler(handler &,
                                            detail::AccessorBaseHost *,
diff --git a/sycl/include/CL/sycl/nd_item.hpp b/sycl/include/CL/sycl/nd_item.hpp
index 62abba368dc7f..902003505f6e9 100644
--- a/sycl/include/CL/sycl/nd_item.hpp
+++ b/sycl/include/CL/sycl/nd_item.hpp
@@ -12,9 +12,9 @@
 #include <CL/sycl/access/access.hpp>
 #include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/helpers.hpp>
+#include <CL/sycl/ext/oneapi/sub_group.hpp>
 #include <CL/sycl/group.hpp>
 #include <CL/sycl/id.hpp>
-#include <CL/sycl/intel/sub_group.hpp>
 #include <CL/sycl/item.hpp>
 #include <CL/sycl/nd_range.hpp>
 #include <CL/sycl/range.hpp>
@@ -67,7 +67,9 @@ template <int dimensions = 1> class nd_item {
 
   group<dimensions> get_group() const { return Group; }
 
-  intel::sub_group get_sub_group() const { return intel::sub_group(); }
+  ext::oneapi::sub_group get_sub_group() const {
+    return ext::oneapi::sub_group();
+  }
 
   size_t ALWAYS_INLINE get_group(int dimension) const {
     size_t Size = Group[dimension];
diff --git a/sycl/include/CL/sycl/pipes.hpp b/sycl/include/CL/sycl/pipes.hpp
index e02fa1155592f..647c9a2a8aee8 100644
--- a/sycl/include/CL/sycl/pipes.hpp
+++ b/sycl/include/CL/sycl/pipes.hpp
@@ -8,11 +8,11 @@
 
 #pragma once
 
-#include <CL/sycl/intel/pipes.hpp>
+#include <CL/sycl/ext/oneapi/pipes.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 template <class name, class dataT, int32_t min_capacity = 0>
-using pipe = intel::pipe<name, dataT, min_capacity>;
+using pipe = ext::oneapi::pipe<name, dataT, min_capacity>;
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/program.hpp b/sycl/include/CL/sycl/program.hpp
index c6dbebf3f45bf..61137756c2b2a 100644
--- a/sycl/include/CL/sycl/program.hpp
+++ b/sycl/include/CL/sycl/program.hpp
@@ -12,7 +12,7 @@
 #include <CL/sycl/detail/export.hpp>
 #include <CL/sycl/detail/kernel_desc.hpp>
 #include <CL/sycl/detail/os_util.hpp>
-#include <CL/sycl/experimental/spec_constant.hpp>
+#include <CL/sycl/ext/oneapi/spec_constant.hpp>
 #include <CL/sycl/info/info_desc.hpp>
 #include <CL/sycl/kernel.hpp>
 #include <CL/sycl/stl.hpp>
@@ -307,7 +307,7 @@ class __SYCL_EXPORT program {
   /// \return a specialization constant instance corresponding to given type ID
   ///         passed as a template parameter
   template <typename ID, typename T>
-  experimental::spec_constant<T, ID> set_spec_constant(T Cst) {
+  ext::oneapi::spec_constant<T, ID> set_spec_constant(T Cst) {
     constexpr const char *Name = detail::SpecConstantInfo<ID>::getName();
     static_assert(std::is_integral<T>::value ||
                       std::is_floating_point<T>::value,
@@ -315,10 +315,10 @@ class __SYCL_EXPORT program {
 #ifdef __SYCL_DEVICE_ONLY__
     (void)Cst;
     (void)Name;
-    return experimental::spec_constant<T, ID>();
+    return ext::oneapi::spec_constant<T, ID>();
 #else
     set_spec_constant_impl(Name, &Cst, sizeof(T));
-    return experimental::spec_constant<T, ID>(Cst);
+    return ext::oneapi::spec_constant<T, ID>(Cst);
 #endif // __SYCL_DEVICE_ONLY__
   }
 
diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp
index ff90729b80367..4177c4d1cd8c0 100644
--- a/sycl/source/detail/program_impl.cpp
+++ b/sycl/source/detail/program_impl.cpp
@@ -479,8 +479,8 @@ vector_class<device> program_impl::get_info<info::program::devices>() const {
 void program_impl::set_spec_constant_impl(const char *Name, const void *ValAddr,
                                           size_t ValSize) {
   if (MState != program_state::none)
-    throw cl::sycl::experimental::spec_const_error("Invalid program state",
-                                                   PI_INVALID_PROGRAM);
+    throw cl::sycl::ext::oneapi::spec_const_error("Invalid program state",
+                                                  PI_INVALID_PROGRAM);
   // Reuse cached programs lock as opposed to introducing a new lock.
   auto LockGuard = MContext->getKernelProgramCache().acquireCachedPrograms();
   spec_constant_impl &SC = SpecConstRegistry[Name];
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 78838d0178ee6..22f1fbfa86a75 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -15,7 +15,7 @@
 #include <CL/sycl/detail/util.hpp>
 #include <CL/sycl/device.hpp>
 #include <CL/sycl/exception.hpp>
-#include <CL/sycl/experimental/spec_constant.hpp>
+#include <CL/sycl/ext/oneapi/spec_constant.hpp>
 #include <CL/sycl/stl.hpp>
 #include <detail/context_impl.hpp>
 #include <detail/device_impl.hpp>
@@ -981,7 +981,7 @@ void ProgramManager::flushSpecConstants(const program_impl &Prg,
       auto LockGuard = Ctx->getKernelProgramCache().acquireCachedPrograms();
       auto It = NativePrograms.find(NativePrg);
       if (It == NativePrograms.end())
-        throw sycl::experimental::spec_const_error(
+        throw sycl::ext::oneapi::spec_const_error(
             "spec constant is set in a program w/o a binary image",
             PI_INVALID_OPERATION);
       Img = It->second;
diff --git a/sycl/source/function_pointer.cpp b/sycl/source/function_pointer.cpp
index c273ae817c8bf..b46fd533fadd8 100644
--- a/sycl/source/function_pointer.cpp
+++ b/sycl/source/function_pointer.cpp
@@ -6,16 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl/intel/function_pointer.hpp>
+#include <CL/sycl/ext/oneapi/function_pointer.hpp>
 #include <detail/device_impl.hpp>
 #include <detail/program_impl.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
-intel::device_func_ptr_holder_t
+ext::oneapi::device_func_ptr_holder_t
 getDeviceFunctionPointerImpl(device &D, program &P, const char *FuncName) {
-  intel::device_func_ptr_holder_t FPtr = 0;
+  ext::oneapi::device_func_ptr_holder_t FPtr = 0;
   // FIXME: return value must be checked here, but since we cannot yet check
   // if corresponding extension is supported, let's silently ignore it here.
   const detail::plugin &Plugin = detail::getSyclObjImpl(P)->getPlugin();
diff --git a/sycl/test/atomic_ref/add.cpp b/sycl/test/atomic_ref/add.cpp
index b152166e4f966..6face7c749bf5 100644
--- a/sycl/test/atomic_ref/add.cpp
+++ b/sycl/test/atomic_ref/add.cpp
@@ -10,7 +10,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 void add_fetch_test(queue q, size_t N) {
@@ -26,7 +26,7 @@ void add_fetch_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
         out[gid] = atm.fetch_add(T(1));
       });
     });
@@ -59,7 +59,7 @@ void add_plus_equal_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
         out[gid] = atm += T(1);
       });
     });
@@ -92,7 +92,7 @@ void add_pre_inc_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
         out[gid] = ++atm;
       });
     });
@@ -125,7 +125,7 @@ void add_post_inc_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
         out[gid] = atm++;
       });
     });
diff --git a/sycl/test/atomic_ref/compare_exchange.cpp b/sycl/test/atomic_ref/compare_exchange.cpp
index 8f563fccb65fd..31290418a144b 100644
--- a/sycl/test/atomic_ref/compare_exchange.cpp
+++ b/sycl/test/atomic_ref/compare_exchange.cpp
@@ -9,7 +9,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 class compare_exchange_kernel;
@@ -29,7 +29,7 @@ void compare_exchange_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<compare_exchange_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(exc[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(exc[0]);
         T result = initial;
         bool success = atm.compare_exchange_strong(result, (T)gid);
         if (success) {
diff --git a/sycl/test/atomic_ref/exchange.cpp b/sycl/test/atomic_ref/exchange.cpp
index 2ce1292cfdd55..bba5dae8e29b3 100644
--- a/sycl/test/atomic_ref/exchange.cpp
+++ b/sycl/test/atomic_ref/exchange.cpp
@@ -9,7 +9,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 class exchange_kernel;
@@ -29,7 +29,7 @@ void exchange_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<exchange_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(exc[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(exc[0]);
         out[gid] = atm.exchange(gid);
       });
     });
diff --git a/sycl/test/atomic_ref/load.cpp b/sycl/test/atomic_ref/load.cpp
index 274191b9a5ac3..4d95c4a5f1858 100644
--- a/sycl/test/atomic_ref/load.cpp
+++ b/sycl/test/atomic_ref/load.cpp
@@ -9,7 +9,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 class load_kernel;
@@ -29,7 +29,7 @@ void load_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<load_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(ld[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(ld[0]);
         out[gid] = atm.load();
       });
     });
diff --git a/sycl/test/atomic_ref/max.cpp b/sycl/test/atomic_ref/max.cpp
index c8bccf1c28067..7be6b9ac392b5 100644
--- a/sycl/test/atomic_ref/max.cpp
+++ b/sycl/test/atomic_ref/max.cpp
@@ -10,7 +10,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 void max_test(queue q, size_t N) {
@@ -27,7 +27,7 @@ void max_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
 
         // +1 accounts for lowest() returning 0 for unsigned types
         out[gid] = atm.fetch_max(T(gid) + 1);
diff --git a/sycl/test/atomic_ref/min.cpp b/sycl/test/atomic_ref/min.cpp
index 8313c4931136c..47787a52b2eea 100644
--- a/sycl/test/atomic_ref/min.cpp
+++ b/sycl/test/atomic_ref/min.cpp
@@ -10,7 +10,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 void min_test(queue q, size_t N) {
@@ -27,7 +27,7 @@ void min_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
         out[gid] = atm.fetch_min(T(gid));
       });
     });
diff --git a/sycl/test/atomic_ref/store.cpp b/sycl/test/atomic_ref/store.cpp
index eebdba5ced095..69abba8e0580d 100644
--- a/sycl/test/atomic_ref/store.cpp
+++ b/sycl/test/atomic_ref/store.cpp
@@ -9,7 +9,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 class store_kernel;
@@ -24,7 +24,7 @@ void store_test(queue q, size_t N) {
       auto st = store_buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<store_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(st[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(st[0]);
         atm.store(T(gid));
       });
     });
diff --git a/sycl/test/atomic_ref/sub.cpp b/sycl/test/atomic_ref/sub.cpp
index 52e338048e7be..13ed2c5bdafbe 100644
--- a/sycl/test/atomic_ref/sub.cpp
+++ b/sycl/test/atomic_ref/sub.cpp
@@ -10,7 +10,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <typename T>
 void sub_fetch_test(queue q, size_t N) {
@@ -26,7 +26,7 @@ void sub_fetch_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
         out[gid] = atm.fetch_sub(T(1));
       });
     });
@@ -59,7 +59,7 @@ void sub_plus_equal_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
         out[gid] = atm -= T(1);
       });
     });
@@ -92,7 +92,7 @@ void sub_pre_dec_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
         out[gid] = --atm;
       });
     });
@@ -125,7 +125,7 @@ void sub_post_dec_test(queue q, size_t N) {
       auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, intel::memory_order::relaxed, intel::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
         out[gid] = atm--;
       });
     });
diff --git a/sycl/test/basic_tests/esimd/block_load_store.cpp b/sycl/test/basic_tests/esimd/block_load_store.cpp
index 268df2c7991c6..b83f4f0378a02 100644
--- a/sycl/test/basic_tests/esimd/block_load_store.cpp
+++ b/sycl/test/basic_tests/esimd/block_load_store.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel(accessor<int, 1, access::mode::read_write, access::target::global_buffer> &buf) __attribute__((sycl_device)) {
diff --git a/sycl/test/basic_tests/esimd/esimd_math.cpp b/sycl/test/basic_tests/esimd/esimd_math.cpp
index bada49639b366..e037ebb1c3c84 100644
--- a/sycl/test/basic_tests/esimd/esimd_math.cpp
+++ b/sycl/test/basic_tests/esimd/esimd_math.cpp
@@ -1,11 +1,11 @@
 // RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -fsyntax-only -Xclang -verify %s
 // expected-no-diagnostics
 
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 
 bool test_esimd_mask() __attribute__((sycl_device)) {
   simd<ushort, 16> a(0);
diff --git a/sycl/test/basic_tests/esimd/flat_atomic.cpp b/sycl/test/basic_tests/esimd/flat_atomic.cpp
index 03bf1742015ed..eb8bbc1293a28 100644
--- a/sycl/test/basic_tests/esimd/flat_atomic.cpp
+++ b/sycl/test/basic_tests/esimd/flat_atomic.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel0(accessor<uint32_t, 1, access::mode::read_write, access::target::global_buffer> &buf) __attribute__((sycl_device)) {
diff --git a/sycl/test/basic_tests/esimd/gather4_scatter4.cpp b/sycl/test/basic_tests/esimd/gather4_scatter4.cpp
index 0e912f393afd3..5627a8757545d 100644
--- a/sycl/test/basic_tests/esimd/gather4_scatter4.cpp
+++ b/sycl/test/basic_tests/esimd/gather4_scatter4.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel(accessor<int, 1, access::mode::read_write,
diff --git a/sycl/test/basic_tests/esimd/gather_scatter.cpp b/sycl/test/basic_tests/esimd/gather_scatter.cpp
index d0c83ef7606e9..65677ed922ce2 100644
--- a/sycl/test/basic_tests/esimd/gather_scatter.cpp
+++ b/sycl/test/basic_tests/esimd/gather_scatter.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel(accessor<int, 1, access::mode::read_write, access::target::global_buffer> &buf) __attribute__((sycl_device)) {
diff --git a/sycl/test/basic_tests/esimd/global_var.cpp b/sycl/test/basic_tests/esimd/global_var.cpp
index e2b52fc51241a..87694cc844da7 100644
--- a/sycl/test/basic_tests/esimd/global_var.cpp
+++ b/sycl/test/basic_tests/esimd/global_var.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -fsyntax-only -Xclang -verify %s
 // expected-no-diagnostics
 
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 
 // This test checks that DPC++ compiler in ESIMD mode understands
 // the ESIMD_PRIVATE and ESIMD_REGISTER macros
diff --git a/sycl/test/basic_tests/esimd/simd.cpp b/sycl/test/basic_tests/esimd/simd.cpp
index 5eb82677e0691..b152fd55b1b6a 100644
--- a/sycl/test/basic_tests/esimd/simd.cpp
+++ b/sycl/test/basic_tests/esimd/simd.cpp
@@ -1,11 +1,11 @@
 // RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -fsyntax-only -Xclang -verify %s
 // expected-no-diagnostics
 
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 
 bool test_simd_ctors() __attribute__((sycl_device)) {
   simd<int, 16> v0 = 1;
diff --git a/sycl/test/basic_tests/esimd/simd_merge.cpp b/sycl/test/basic_tests/esimd/simd_merge.cpp
index 20c357188be79..b6b0ccea3841e 100644
--- a/sycl/test/basic_tests/esimd/simd_merge.cpp
+++ b/sycl/test/basic_tests/esimd/simd_merge.cpp
@@ -1,11 +1,11 @@
 // RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -fsyntax-only -Xclang -verify %s
 // expected-no-diagnostics
 
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 
 bool test_simd_merge1() __attribute__((sycl_device)) {
   simd<int, 16> v0 = 1;
diff --git a/sycl/test/basic_tests/esimd/simd_view.cpp b/sycl/test/basic_tests/esimd/simd_view.cpp
index 0910dace1c98a..e30829ff170e2 100644
--- a/sycl/test/basic_tests/esimd/simd_view.cpp
+++ b/sycl/test/basic_tests/esimd/simd_view.cpp
@@ -1,11 +1,11 @@
 // RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -fsyntax-only -Xclang -verify %s
 // expected-no-diagnostics
 
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 
 bool test_simd_view_ctors() __attribute__((sycl_device)) {
   simd<int, 16> v0(0, 1);
diff --git a/sycl/test/basic_tests/esimd/slm_atomic.cpp b/sycl/test/basic_tests/esimd/slm_atomic.cpp
index b7f094075e147..b8b93cdaaf148 100644
--- a/sycl/test/basic_tests/esimd/slm_atomic.cpp
+++ b/sycl/test/basic_tests/esimd/slm_atomic.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel0() __attribute__((sycl_device)) {
diff --git a/sycl/test/basic_tests/esimd/slm_block.cpp b/sycl/test/basic_tests/esimd/slm_block.cpp
index a49083f5c6524..eb3c23ab3cd62 100644
--- a/sycl/test/basic_tests/esimd/slm_block.cpp
+++ b/sycl/test/basic_tests/esimd/slm_block.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel() __attribute__((sycl_device)) {
diff --git a/sycl/test/basic_tests/esimd/slm_load.cpp b/sycl/test/basic_tests/esimd/slm_load.cpp
index a84dce7b25f7c..a28d4f1d3411a 100644
--- a/sycl/test/basic_tests/esimd/slm_load.cpp
+++ b/sycl/test/basic_tests/esimd/slm_load.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel() __attribute__((sycl_device)) {
diff --git a/sycl/test/basic_tests/esimd/slm_load4.cpp b/sycl/test/basic_tests/esimd/slm_load4.cpp
index f1f483e72c214..c6ea7074772c6 100644
--- a/sycl/test/basic_tests/esimd/slm_load4.cpp
+++ b/sycl/test/basic_tests/esimd/slm_load4.cpp
@@ -2,11 +2,11 @@
 // expected-no-diagnostics
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/esimd.hpp>
+#include <CL/sycl/ext/intel/esimd.hpp>
 #include <limits>
 #include <utility>
 
-using namespace sycl::intel::gpu;
+using namespace sycl::ext::intel::gpu;
 using namespace cl::sycl;
 
 void kernel() __attribute__((sycl_device)) {
diff --git a/sycl/test/built-ins/printf.cpp b/sycl/test/built-ins/printf.cpp
index 6536498587729..fe1a416b9550e 100644
--- a/sycl/test/built-ins/printf.cpp
+++ b/sycl/test/built-ins/printf.cpp
@@ -41,7 +41,7 @@ int main() {
     Queue.submit([&](handler &CGH) {
       CGH.single_task<class integral>([=]() {
         // String
-        intel::experimental::printf(format_hello_world);
+        ext::oneapi::printf(format_hello_world);
         // Due to a bug in Intel CPU Runtime for OpenCL on Windows, information
         // printed using such format strings (without %-specifiers) might
         // appear in different order if output is redirected to a file or
@@ -50,8 +50,8 @@ int main() {
         // CHECK: {{(Hello, World!)?}}
 
         // Integral types
-        intel::experimental::printf(format_int, (int32_t)123);
-        intel::experimental::printf(format_int, (int32_t)-123);
+        ext::oneapi::printf(format_int, (int32_t)123);
+        ext::oneapi::printf(format_int, (int32_t)-123);
         // CHECK: 123
         // CHECK-NEXT: -123
 
@@ -60,8 +60,8 @@ int main() {
           // You can declare format string in non-global scope, but in this case
           // static keyword is required
           static const CONSTANT char format[] = "%f\n";
-          intel::experimental::printf(format, 33.4f);
-          intel::experimental::printf(format, -33.4f);
+          ext::oneapi::printf(format, 33.4f);
+          ext::oneapi::printf(format, -33.4f);
         }
         // CHECK-NEXT: 33.4
         // CHECK-NEXT: -33.4
@@ -73,21 +73,21 @@ int main() {
         using ocl_int4 = cl::sycl::vec<int, 4>::vector_t;
         {
           static const CONSTANT char format[] = "%v4d\n";
-          intel::experimental::printf(format, (ocl_int4)v4);
+          ext::oneapi::printf(format, (ocl_int4)v4);
         }
 
         // However, you are still able to print them by-element:
         {
-          intel::experimental::printf(format_vec, (int32_t)v4.w(),
+          ext::oneapi::printf(format_vec, (int32_t)v4.w(),
                                       (int32_t)v4.z(), (int32_t)v4.y(),
                                       (int32_t)v4.x());
         }
 #else
         // On host side you always have to print them by-element:
-        intel::experimental::printf(format_vec, (int32_t)v4.x(),
+        ext::oneapi::printf(format_vec, (int32_t)v4.x(),
                                     (int32_t)v4.y(), (int32_t)v4.z(),
                                     (int32_t)v4.w());
-        intel::experimental::printf(format_vec, (int32_t)v4.w(),
+        ext::oneapi::printf(format_vec, (int32_t)v4.w(),
                                     (int32_t)v4.z(), (int32_t)v4.y(),
                                     (int32_t)v4.x());
 #endif // __SYCL_DEVICE_ONLY__
@@ -100,7 +100,7 @@ int main() {
         // According to OpenCL spec, argument should be a void pointer
         {
           static const CONSTANT char format[] = "%p\n";
-          intel::experimental::printf(format, (void *)Ptr);
+          ext::oneapi::printf(format, (void *)Ptr);
         }
         // CHECK-NEXT: {{(0x)?[0-9a-fA-F]+$}}
       });
@@ -111,7 +111,7 @@ int main() {
     Queue.submit([&](handler &CGH) {
       CGH.parallel_for<class stream_string>(range<1>(10), [=](id<1> i) {
         // cast to uint64_t to be sure that we pass 64-bit unsigned value
-        intel::experimental::printf(format_hello_world_2, (uint64_t)i.get(0));
+        ext::oneapi::printf(format_hello_world_2, (uint64_t)i.get(0));
       });
     });
     Queue.wait();
diff --git a/sycl/test/built-ins/scalar_integer.cpp b/sycl/test/built-ins/scalar_integer.cpp
index 18dd76294022a..85a0eae294d5f 100644
--- a/sycl/test/built-ins/scalar_integer.cpp
+++ b/sycl/test/built-ins/scalar_integer.cpp
@@ -245,7 +245,7 @@ int main() {
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class ctzSI1>([=]() {
-          AccR[0] = s::intel::ctz(s::cl_int{ 0x7FFFFFF0 });
+          AccR[0] = s::ext::oneapi::ctz(s::cl_int{ 0x7FFFFFF0 });
         });
       });
     }
diff --git a/sycl/test/built-ins/vector_integer.cpp b/sycl/test/built-ins/vector_integer.cpp
index 5a3a3dd0c80cf..74db909d278bb 100644
--- a/sycl/test/built-ins/vector_integer.cpp
+++ b/sycl/test/built-ins/vector_integer.cpp
@@ -387,7 +387,7 @@ int main() {
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class ctzSI2>([=]() {
-          AccR[0] = s::intel::ctz(s::cl_int2{ 0x7FFFFFF0, 0x7FFFFFF0 });
+          AccR[0] = s::ext::oneapi::ctz(s::cl_int2{ 0x7FFFFFF0, 0x7FFFFFF0 });
         });
       });
     }
diff --git a/sycl/test/function-pointers/fp-as-kernel-arg.cpp b/sycl/test/function-pointers/fp-as-kernel-arg.cpp
index c68a891dcf94c..ba76fdf5dbda8 100644
--- a/sycl/test/function-pointers/fp-as-kernel-arg.cpp
+++ b/sycl/test/function-pointers/fp-as-kernel-arg.cpp
@@ -31,7 +31,7 @@ int main() {
   P.build_with_kernel_type<class K>();
   cl::sycl::kernel KE = P.get_kernel<class K>();
 
-  auto FptrStorage = cl::sycl::intel::get_device_func_ptr(&add, "add", P, D);
+  auto FptrStorage = cl::sycl::ext::oneapi::get_device_func_ptr(&add, "add", P, D);
   if (!D.is_host()) {
     // FIXME: update this check with query to supported extension
     // For now, we don't have runtimes that report required OpenCL extension and
@@ -55,7 +55,7 @@ int main() {
     CGH.parallel_for<class K>(
         KE, cl::sycl::range<1>(Size), [=](cl::sycl::id<1> Index) {
       auto Fptr =
-          cl::sycl::intel::to_device_func_ptr<decltype(add)>(FptrStorage);
+          cl::sycl::ext::oneapi::to_device_func_ptr<decltype(add)>(FptrStorage);
       AccA[Index] = Fptr(AccA[Index], AccB[Index]);
     });
   });
diff --git a/sycl/test/function-pointers/pass-fp-through-buffer.cpp b/sycl/test/function-pointers/pass-fp-through-buffer.cpp
index aa2bc85ec9874..255bbb0212eeb 100644
--- a/sycl/test/function-pointers/pass-fp-through-buffer.cpp
+++ b/sycl/test/function-pointers/pass-fp-through-buffer.cpp
@@ -33,12 +33,12 @@ int main() {
   P.build_with_kernel_type<class K>();
   cl::sycl::kernel KE = P.get_kernel<class K>();
 
-  cl::sycl::buffer<cl::sycl::intel::device_func_ptr_holder_t> DispatchTable(2);
+  cl::sycl::buffer<cl::sycl::ext::oneapi::device_func_ptr_holder_t> DispatchTable(2);
   {
     auto DTAcc =
         DispatchTable.get_access<cl::sycl::access::mode::discard_write>();
-    DTAcc[0] = cl::sycl::intel::get_device_func_ptr(&add, "add", P, D);
-    DTAcc[1] = cl::sycl::intel::get_device_func_ptr(&sub, "sub", P, D);
+    DTAcc[0] = cl::sycl::ext::oneapi::get_device_func_ptr(&add, "add", P, D);
+    DTAcc[1] = cl::sycl::ext::oneapi::get_device_func_ptr(&sub, "sub", P, D);
     if (!D.is_host()) {
       // FIXME: update this check with query to supported extension
       // For now, we don't have runtimes that report required OpenCL extension
@@ -69,7 +69,7 @@ int main() {
       CGH.parallel_for<class K>(
           KE, cl::sycl::range<1>(Size), [=](cl::sycl::id<1> Index) {
         auto FP =
-            cl::sycl::intel::to_device_func_ptr<int(int, int)>(AccDT[Mode]);
+            cl::sycl::ext::oneapi::to_device_func_ptr<int(int, int)>(AccDT[Mode]);
 
         AccA[Index] = FP(AccA[Index], AccB[Index]);
       });
diff --git a/sycl/test/group-algorithm/all_of.cpp b/sycl/test/group-algorithm/all_of.cpp
index 2a175d000bb6f..385b8f581a424 100644
--- a/sycl/test/group-algorithm/all_of.cpp
+++ b/sycl/test/group-algorithm/all_of.cpp
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <numeric>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <class Predicate>
 class all_of_kernel;
diff --git a/sycl/test/group-algorithm/any_of.cpp b/sycl/test/group-algorithm/any_of.cpp
index 6ce61afaffdec..db7acd83ce624 100644
--- a/sycl/test/group-algorithm/any_of.cpp
+++ b/sycl/test/group-algorithm/any_of.cpp
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <numeric>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <class Predicate>
 class any_of_kernel;
diff --git a/sycl/test/group-algorithm/broadcast.cpp b/sycl/test/group-algorithm/broadcast.cpp
index df0887a40d4a0..c3f26be96b630 100644
--- a/sycl/test/group-algorithm/broadcast.cpp
+++ b/sycl/test/group-algorithm/broadcast.cpp
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <numeric>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 class broadcast_kernel;
 
diff --git a/sycl/test/group-algorithm/exclusive_scan.cpp b/sycl/test/group-algorithm/exclusive_scan.cpp
index 47dc1f6122720..537d2117b5bc7 100644
--- a/sycl/test/group-algorithm/exclusive_scan.cpp
+++ b/sycl/test/group-algorithm/exclusive_scan.cpp
@@ -14,7 +14,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <class BinaryOperation, int TestNumber>
 class exclusive_scan_kernel;
diff --git a/sycl/test/group-algorithm/inclusive_scan.cpp b/sycl/test/group-algorithm/inclusive_scan.cpp
index 54311a162ed9e..8e718d82ae22f 100644
--- a/sycl/test/group-algorithm/inclusive_scan.cpp
+++ b/sycl/test/group-algorithm/inclusive_scan.cpp
@@ -14,7 +14,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <class BinaryOperation, int TestNumber>
 class inclusive_scan_kernel;
diff --git a/sycl/test/group-algorithm/leader.cpp b/sycl/test/group-algorithm/leader.cpp
index ff02cf7e77f9e..b41c4b3ab7c4d 100644
--- a/sycl/test/group-algorithm/leader.cpp
+++ b/sycl/test/group-algorithm/leader.cpp
@@ -10,7 +10,7 @@
 #include <CL/sycl.hpp>
 #include <cassert>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 class leader_kernel;
 
diff --git a/sycl/test/group-algorithm/none_of.cpp b/sycl/test/group-algorithm/none_of.cpp
index c8b56158d20b7..e3ef07030ceab 100644
--- a/sycl/test/group-algorithm/none_of.cpp
+++ b/sycl/test/group-algorithm/none_of.cpp
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <numeric>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <class Predicate>
 class none_of_kernel;
diff --git a/sycl/test/group-algorithm/reduce.cpp b/sycl/test/group-algorithm/reduce.cpp
index 64ed0bd82fcc2..251f3fbb1225a 100644
--- a/sycl/test/group-algorithm/reduce.cpp
+++ b/sycl/test/group-algorithm/reduce.cpp
@@ -13,7 +13,7 @@
 #include <limits>
 #include <numeric>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 template <class BinaryOperation>
 class reduce_kernel;
diff --git a/sycl/test/linear_id/linear-sub_group.cpp b/sycl/test/linear_id/linear-sub_group.cpp
index 2b3f75ae2182e..b354dd186a5d2 100644
--- a/sycl/test/linear_id/linear-sub_group.cpp
+++ b/sycl/test/linear_id/linear-sub_group.cpp
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
           nd_range<2>(range<2>(outer, inner), range<2>(outer, inner)),
           [=](nd_item<2> it) {
             id<2> idx = it.get_global_id();
-            intel::sub_group sg = it.get_sub_group();
+            ext::oneapi::sub_group sg = it.get_sub_group();
             output[idx] = sg.get_group_id()[0] * sg.get_local_range()[0] +
                           sg.get_local_id()[0];
           });
diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp
index 7f8e8e9726e59..35ac1266cb11c 100644
--- a/sycl/test/reduction/reduction_ctor.cpp
+++ b/sycl/test/reduction/reduction_ctor.cpp
@@ -77,7 +77,7 @@ void testKnown(T Identity, T A, T B) {
     // This accessor is not really used in this test.
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         ReduAcc(ReduBuf, CGH);
-    auto Redu = intel::reduction(ReduAcc, BOp);
+    auto Redu = ext::oneapi::reduction(ReduAcc, BOp);
     assert(Redu.getIdentity() == Identity &&
            "Failed getIdentity() check().");
     test_reducer(Redu, A, B);
@@ -99,7 +99,7 @@ void testUnknown(T Identity, T A, T B) {
     // This accessor is not really used in this test.
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         ReduAcc(ReduBuf, CGH);
-    auto Redu = intel::reduction(ReduAcc, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(ReduAcc, Identity, BOp);
     assert(Redu.getIdentity() == Identity &&
            "Failed getIdentity() check().");
     test_reducer(Redu, Identity, A, B);
@@ -119,18 +119,18 @@ void testBoth(T Identity, T A, T B) {
 
 int main() {
   // testKnown does not pass identity to reduction ctor.
-  testBoth<int, intel::plus<int>>(0, 1, 7);
+  testBoth<int, ext::oneapi::plus<int>>(0, 1, 7);
   testBoth<int, std::multiplies<int>>(1, 1, 7);
-  testBoth<int, intel::bit_or<int>>(0, 1, 8);
-  testBoth<int, intel::bit_xor<int>>(0, 7, 3);
-  testBoth<int, intel::bit_and<int>>(~0, 7, 3);
-  testBoth<int, intel::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
-  testBoth<int, intel::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);
+  testBoth<int, ext::oneapi::bit_or<int>>(0, 1, 8);
+  testBoth<int, ext::oneapi::bit_xor<int>>(0, 7, 3);
+  testBoth<int, ext::oneapi::bit_and<int>>(~0, 7, 3);
+  testBoth<int, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
+  testBoth<int, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);
 
-  testBoth<float, intel::plus<float>>(0, 1, 7);
+  testBoth<float, ext::oneapi::plus<float>>(0, 1, 7);
   testBoth<float, std::multiplies<float>>(1, 1, 7);
-  testBoth<float, intel::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
-  testBoth<float, intel::maximum<float>>(getMinimumFPValue<float>(), 7, 3);
+  testBoth<float, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
+  testBoth<float, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 7, 3);
 
   testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
   testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp
index c700097993079..49a5e3d799262 100644
--- a/sycl/test/reduction/reduction_nd_conditional.cpp
+++ b/sycl/test/reduction/reduction_nd_conditional.cpp
@@ -85,7 +85,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
@@ -114,10 +114,10 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 }
 
 int main() {
-  test<int, 0, intel::plus<int>>(0, 2, 2);
-  test<int, 1, intel::plus<int>>(0, 7, 7);
-  test<int, 0, intel::plus<int>>(0, 2, 64);
-  test<short, 1, intel::plus<short>>(0, 16, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 2, 2);
+  test<int, 1, ext::oneapi::plus<int>>(0, 7, 7);
+  test<int, 0, ext::oneapi::plus<int>>(0, 2, 64);
+  test<short, 1, ext::oneapi::plus<short>>(0, 16, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_ext_type.hpp b/sycl/test/reduction/reduction_nd_ext_type.hpp
index a80aefc09cd45..9cdfbb8dce2fc 100644
--- a/sycl/test/reduction/reduction_nd_ext_type.hpp
+++ b/sycl/test/reduction/reduction_nd_ext_type.hpp
@@ -30,7 +30,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, Mode, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
@@ -64,15 +64,15 @@ int runTests(const string_class &ExtensionName) {
 
   // Check some less standards WG sizes and corner cases first.
   test<T, 1, access::mode::read_write, std::multiplies<T>>(0, 4, 4);
-  test<T, 0, access::mode::discard_write, intel::plus<T>>(0, 4, 64);
+  test<T, 0, access::mode::discard_write, ext::oneapi::plus<T>>(0, 4, 64);
 
-  test<T, 0, access::mode::read_write, intel::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
-  test<T, 1, access::mode::discard_write, intel::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
+  test<T, 0, access::mode::read_write, ext::oneapi::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
+  test<T, 1, access::mode::discard_write, ext::oneapi::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
 
 #if __cplusplus >= 201402L
-  test<T, 1, access::mode::read_write, intel::plus<>>(1, 3, 3 * 5);
-  test<T, 1, access::mode::discard_write, intel::minimum<>>(getMaximumFPValue<T>(), 3, 3);
-  test<T, 0, access::mode::discard_write, intel::maximum<>>(getMinimumFPValue<T>(), 3, 3);
+  test<T, 1, access::mode::read_write, ext::oneapi::plus<>>(1, 3, 3 * 5);
+  test<T, 1, access::mode::discard_write, ext::oneapi::minimum<>>(getMaximumFPValue<T>(), 3, 3);
+  test<T, 0, access::mode::discard_write, ext::oneapi::maximum<>>(getMinimumFPValue<T>(), 3, 3);
 #endif // __cplusplus >= 201402L
 
   std::cout << "Test passed\n";
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
index 834ccf4407649..356038d9f38b6 100644
--- a/sycl/test/reduction/reduction_nd_s0_dw.cpp
+++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -35,7 +35,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
@@ -59,33 +59,33 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
   // Check some less standards WG sizes and corner cases first.
-  test<int, 0, intel::plus<int>>(0, 2, 2);
-  test<int, 0, intel::plus<int>>(0, 7, 7);
-  test<int, 0, intel::plus<int>>(0, 9, 18);
-  test<int, 0, intel::plus<int>>(0, 49, 49 * 5);
+  test<int, 0, ext::oneapi::plus<int>>(0, 2, 2);
+  test<int, 0, ext::oneapi::plus<int>>(0, 7, 7);
+  test<int, 0, ext::oneapi::plus<int>>(0, 9, 18);
+  test<int, 0, ext::oneapi::plus<int>>(0, 49, 49 * 5);
 
   // Try some power-of-two work-group sizes.
-  test<int, 0, intel::plus<int>>(0, 2, 64);
-  test<int, 0, intel::plus<int>>(0, 4, 64);
-  test<int, 0, intel::plus<int>>(0, 8, 128);
-  test<int, 0, intel::plus<int>>(0, 16, 256);
-  test<int, 0, intel::plus<int>>(0, 32, 256);
-  test<int, 0, intel::plus<int>>(0, 64, 256);
-  test<int, 0, intel::plus<int>>(0, 128, 256);
-  test<int, 0, intel::plus<int>>(0, 256, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 2, 64);
+  test<int, 0, ext::oneapi::plus<int>>(0, 4, 64);
+  test<int, 0, ext::oneapi::plus<int>>(0, 8, 128);
+  test<int, 0, ext::oneapi::plus<int>>(0, 16, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 32, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 64, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 128, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 256, 256);
 
   // Check with various operations.
   test<int, 0, std::multiplies<int>>(1, 8, 256);
-  test<int, 0, intel::bit_or<int>>(0, 8, 256);
-  test<int, 0, intel::bit_xor<int>>(0, 8, 256);
-  test<int, 0, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 0, ext::oneapi::bit_or<int>>(0, 8, 256);
+  test<int, 0, ext::oneapi::bit_xor<int>>(0, 8, 256);
+  test<int, 0, ext::oneapi::bit_and<int>>(~0, 8, 256);
+  test<int, 0, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 0, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 0, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
index 2040b632e07fb..88f408a1c4f8d 100644
--- a/sycl/test/reduction/reduction_nd_s0_rw.cpp
+++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -37,7 +37,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::read_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
@@ -61,33 +61,33 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
   // Check some less standards WG sizes and corner cases first.
-  test<int, 0, intel::plus<int>>(0, 2, 2);
-  test<int, 0, intel::plus<int>>(0, 7, 7);
-  test<int, 0, intel::plus<int>>(0, 9, 18);
-  test<int, 0, intel::plus<int>>(0, 49, 49 * 5);
+  test<int, 0, ext::oneapi::plus<int>>(0, 2, 2);
+  test<int, 0, ext::oneapi::plus<int>>(0, 7, 7);
+  test<int, 0, ext::oneapi::plus<int>>(0, 9, 18);
+  test<int, 0, ext::oneapi::plus<int>>(0, 49, 49 * 5);
 
   // Try some power-of-two work-group sizes.
-  test<int, 0, intel::plus<int>>(0, 2, 64);
-  test<int, 0, intel::plus<int>>(0, 4, 64);
-  test<int, 0, intel::plus<int>>(0, 8, 128);
-  test<int, 0, intel::plus<int>>(0, 16, 256);
-  test<int, 0, intel::plus<int>>(0, 32, 256);
-  test<int, 0, intel::plus<int>>(0, 64, 256);
-  test<int, 0, intel::plus<int>>(0, 128, 256);
-  test<int, 0, intel::plus<int>>(0, 256, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 2, 64);
+  test<int, 0, ext::oneapi::plus<int>>(0, 4, 64);
+  test<int, 0, ext::oneapi::plus<int>>(0, 8, 128);
+  test<int, 0, ext::oneapi::plus<int>>(0, 16, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 32, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 64, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 128, 256);
+  test<int, 0, ext::oneapi::plus<int>>(0, 256, 256);
 
   // Check with various operations.
   test<int, 0, std::multiplies<int>>(1, 8, 256);
-  test<int, 0, intel::bit_or<int>>(0, 8, 256);
-  test<int, 0, intel::bit_xor<int>>(0, 8, 256);
-  test<int, 0, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 0, ext::oneapi::bit_or<int>>(0, 8, 256);
+  test<int, 0, ext::oneapi::bit_xor<int>>(0, 8, 256);
+  test<int, 0, ext::oneapi::bit_and<int>>(~0, 8, 256);
+  test<int, 0, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 0, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 0, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
index 9fe36d69daa8c..68b8e7cafb811 100644
--- a/sycl/test/reduction/reduction_nd_s1_dw.cpp
+++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp
@@ -36,7 +36,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
@@ -60,33 +60,33 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
   // Check some less standards WG sizes and corner cases first.
-  test<int, 1, intel::plus<int>>(0, 2, 2);
-  test<int, 1, intel::plus<int>>(0, 7, 7);
-  test<int, 1, intel::plus<int>>(0, 9, 18);
-  test<int, 1, intel::plus<int>>(0, 49, 49 * 5);
+  test<int, 1, ext::oneapi::plus<int>>(0, 2, 2);
+  test<int, 1, ext::oneapi::plus<int>>(0, 7, 7);
+  test<int, 1, ext::oneapi::plus<int>>(0, 9, 18);
+  test<int, 1, ext::oneapi::plus<int>>(0, 49, 49 * 5);
 
   // Try some power-of-two work-group sizes.
-  test<int, 1, intel::plus<int>>(0, 2, 64);
-  test<int, 1, intel::plus<int>>(0, 4, 64);
-  test<int, 1, intel::plus<int>>(0, 8, 128);
-  test<int, 1, intel::plus<int>>(0, 16, 256);
-  test<int, 1, intel::plus<int>>(0, 32, 256);
-  test<int, 1, intel::plus<int>>(0, 64, 256);
-  test<int, 1, intel::plus<int>>(0, 128, 256);
-  test<int, 1, intel::plus<int>>(0, 256, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 2, 64);
+  test<int, 1, ext::oneapi::plus<int>>(0, 4, 64);
+  test<int, 1, ext::oneapi::plus<int>>(0, 8, 128);
+  test<int, 1, ext::oneapi::plus<int>>(0, 16, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 32, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 64, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 128, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 256, 256);
 
   // Check with various operations.
   test<int, 1, std::multiplies<int>>(1, 8, 256);
-  test<int, 1, intel::bit_or<int>>(0, 8, 256);
-  test<int, 1, intel::bit_xor<int>>(0, 8, 256);
-  test<int, 1, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 1, ext::oneapi::bit_or<int>>(0, 8, 256);
+  test<int, 1, ext::oneapi::bit_xor<int>>(0, 8, 256);
+  test<int, 1, ext::oneapi::bit_and<int>>(~0, 8, 256);
+  test<int, 1, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 1, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
index e59ed8c4785a5..64ddc1371b070 100644
--- a/sycl/test/reduction/reduction_nd_s1_rw.cpp
+++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -38,7 +38,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::read_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
@@ -62,33 +62,33 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
   // Check some less standards WG sizes and corner cases first.
-  test<int, 1, intel::plus<int>>(0, 2, 2);
-  test<int, 1, intel::plus<int>>(0, 7, 7);
-  test<int, 1, intel::plus<int>>(0, 9, 18);
-  test<int, 1, intel::plus<int>>(0, 49, 49 * 5);
+  test<int, 1, ext::oneapi::plus<int>>(0, 2, 2);
+  test<int, 1, ext::oneapi::plus<int>>(0, 7, 7);
+  test<int, 1, ext::oneapi::plus<int>>(0, 9, 18);
+  test<int, 1, ext::oneapi::plus<int>>(0, 49, 49 * 5);
 
   // Try some power-of-two work-group sizes.
-  test<int, 1, intel::plus<int>>(0, 2, 64);
-  test<int, 1, intel::plus<int>>(0, 4, 64);
-  test<int, 1, intel::plus<int>>(0, 8, 128);
-  test<int, 1, intel::plus<int>>(0, 16, 256);
-  test<int, 1, intel::plus<int>>(0, 32, 256);
-  test<int, 1, intel::plus<int>>(0, 64, 256);
-  test<int, 1, intel::plus<int>>(0, 128, 256);
-  test<int, 1, intel::plus<int>>(0, 256, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 2, 64);
+  test<int, 1, ext::oneapi::plus<int>>(0, 4, 64);
+  test<int, 1, ext::oneapi::plus<int>>(0, 8, 128);
+  test<int, 1, ext::oneapi::plus<int>>(0, 16, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 32, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 64, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 128, 256);
+  test<int, 1, ext::oneapi::plus<int>>(0, 256, 256);
 
   // Check with various operations.
   test<int, 1, std::multiplies<int>>(1, 8, 256);
-  test<int, 1, intel::bit_or<int>>(0, 8, 256);
-  test<int, 1, intel::bit_xor<int>>(0, 8, 256);
-  test<int, 1, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 1, ext::oneapi::bit_or<int>>(0, 8, 256);
+  test<int, 1, ext::oneapi::bit_xor<int>>(0, 8, 256);
+  test<int, 1, ext::oneapi::bit_and<int>>(~0, 8, 256);
+  test<int, 1, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 1, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp
index e972105bbab50..42d19f9d8025a 100644
--- a/sycl/test/reduction/reduction_placeholder.cpp
+++ b/sycl/test/reduction/reduction_placeholder.cpp
@@ -41,7 +41,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
   Q.submit([&](handler &CGH) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     CGH.require(Out);
-    auto Redu = intel::reduction(Out, Identity, BinaryOperation());
+    auto Redu = ext::oneapi::reduction(Out, Identity, BinaryOperation());
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
@@ -64,16 +64,16 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
   // fast atomics and fast reduce
-  test<int, 1, intel::plus<int>>(0, 49, 49 * 5);
-  test<int, 0, intel::plus<int>>(0, 8, 8);
+  test<int, 1, ext::oneapi::plus<int>>(0, 49, 49 * 5);
+  test<int, 0, ext::oneapi::plus<int>>(0, 8, 8);
 
   // fast atomics
-  test<int, 0, intel::bit_or<int>>(0, 7, 7 * 3);
-  test<int, 1, intel::bit_or<int>>(0, 4, 128);
+  test<int, 0, ext::oneapi::bit_or<int>>(0, 7, 7 * 3);
+  test<int, 1, ext::oneapi::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
-  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
+  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
+  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
 
   // generic algorithm
   test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
index ee44ecd37998c..31aa24de49622 100644
--- a/sycl/test/reduction/reduction_transparent.cpp
+++ b/sycl/test/reduction/reduction_transparent.cpp
@@ -46,7 +46,7 @@ void testId(T Identity, size_t WGSize, size_t NWItems) {
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
     CGH.parallel_for<SomeIdClass<T, Dim, BinaryOperation>>(
-        NDRange, intel::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
+        NDRange, ext::oneapi::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
   });
@@ -86,7 +86,7 @@ void testNoId(T Identity, size_t WGSize, size_t NWItems) {
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
     CGH.parallel_for<SomeNoIdClass<T, Dim, BinaryOperation>>(
-        NDRange, intel::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
+        NDRange, ext::oneapi::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
   });
@@ -110,8 +110,8 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
 #if __cplusplus >= 201402L
-  test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
-  test<signed char, 0, intel::plus<>>(0, 7, 49);
+  test<float, 0, ext::oneapi::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
+  test<signed char, 0, ext::oneapi::plus<>>(0, 7, 49);
   test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
 #endif // __cplusplus >= 201402L
 
diff --git a/sycl/test/reduction/reduction_usm.cpp b/sycl/test/reduction/reduction_usm.cpp
index 592a36904a8e8..6915f86af876e 100644
--- a/sycl/test/reduction/reduction_usm.cpp
+++ b/sycl/test/reduction/reduction_usm.cpp
@@ -7,7 +7,7 @@
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
-// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
+// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and barrier()
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // with reductions initialized with USM var.
@@ -62,7 +62,7 @@ void test(T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
   // Compute.
   Q.submit([&](handler &CGH) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
-    auto Redu = intel::reduction(ReduVarPtr, Identity, BOp);
+    auto Redu = ext::oneapi::reduction(ReduVarPtr, Identity, BOp);
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
@@ -105,16 +105,16 @@ void testUSM(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
   // fast atomics and fast reduce
-  testUSM<int, 1, intel::plus<int>>(0, 49, 49 * 5);
-  testUSM<int, 0, intel::plus<int>>(0, 8, 128);
+  testUSM<int, 1, ext::oneapi::plus<int>>(0, 49, 49 * 5);
+  testUSM<int, 0, ext::oneapi::plus<int>>(0, 8, 128);
 
   // fast atomics
-  testUSM<int, 0, intel::bit_or<int>>(0, 7, 7 * 3);
-  testUSM<int, 1, intel::bit_or<int>>(0, 4, 128);
+  testUSM<int, 0, ext::oneapi::bit_or<int>>(0, 7, 7 * 3);
+  testUSM<int, 1, ext::oneapi::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  testUSM<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
-  testUSM<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
+  testUSM<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
+  testUSM<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
 
   // generic algorithm
   testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
diff --git a/sycl/test/regression/sub-group-store-const-ref.cpp b/sycl/test/regression/sub-group-store-const-ref.cpp
index dd10e1d57f12f..b5763f292ea83 100644
--- a/sycl/test/regression/sub-group-store-const-ref.cpp
+++ b/sycl/test/regression/sub-group-store-const-ref.cpp
@@ -13,4 +13,4 @@
 #include <CL/sycl.hpp>
 using namespace sycl;
 
-void test(intel::sub_group sg, global_ptr<int> ptr) { sg.store(ptr, 1); }
+void test(ext::oneapi::sub_group sg, global_ptr<int> ptr) { sg.store(ptr, 1); }
diff --git a/sycl/test/spec_const/spec_const_hw.cpp b/sycl/test/spec_const/spec_const_hw.cpp
index 6e051910ce680..950d4b39dc0bd 100644
--- a/sycl/test/spec_const/spec_const_hw.cpp
+++ b/sycl/test/spec_const/spec_const_hw.cpp
@@ -38,7 +38,7 @@ int val = 10;
 int get_value() { return val; }
 
 float foo(
-    const cl::sycl::experimental::spec_constant<float, MyFloatConst> &f32) {
+    const cl::sycl::ext::oneapi::spec_constant<float, MyFloatConst> &f32) {
   return f32;
 }
 
@@ -69,10 +69,10 @@ int main(int argc, char **argv) {
   // TODO make this floating point once supported by the compiler
   float goldf = (float)get_value();
 
-  cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+  cl::sycl::ext::oneapi::spec_constant<int32_t, MyInt32Const> i32 =
       program1.set_spec_constant<MyInt32Const>(goldi);
 
-  cl::sycl::experimental::spec_constant<float, MyFloatConst> f32 =
+  cl::sycl::ext::oneapi::spec_constant<float, MyFloatConst> f32 =
       program2.set_spec_constant<MyFloatConst>(goldf);
 
   program1.build_with_kernel_type<KernelAAAi>();
diff --git a/sycl/test/spec_const/spec_const_neg.cpp b/sycl/test/spec_const/spec_const_neg.cpp
index 18fb8ed5d9d0c..2942146d0b7a3 100644
--- a/sycl/test/spec_const/spec_const_neg.cpp
+++ b/sycl/test/spec_const/spec_const_neg.cpp
@@ -45,7 +45,7 @@ int main(int argc, char **argv) {
             << "\n";
   cl::sycl::program program1(q.get_context());
 
-  cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+  cl::sycl::ext::oneapi::spec_constant<int32_t, MyInt32Const> i32 =
       program1.set_spec_constant<MyInt32Const>(10);
 
   std::vector<int> veci(1);
@@ -56,7 +56,7 @@ int main(int argc, char **argv) {
   try {
     // This is an attempt to set a spec constant after the program has been
     // built - spec_const_error should be thrown
-    cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+    cl::sycl::ext::oneapi::spec_constant<int32_t, MyInt32Const> i32 =
         program1.set_spec_constant<MyInt32Const>(10);
 
     cl::sycl::buffer<int, 1> bufi(veci.data(), veci.size());
@@ -69,7 +69,7 @@ int main(int argc, char **argv) {
             acci[0] = i32.get();
           });
     });
-  } catch (cl::sycl::experimental::spec_const_error &sc_err) {
+  } catch (cl::sycl::ext::oneapi::spec_const_error &sc_err) {
     passed = true;
   } catch (cl::sycl::exception &e) {
     std::cout << "*** Exception caught: " << e.what() << "\n";
diff --git a/sycl/test/spec_const/spec_const_redefine.cpp b/sycl/test/spec_const/spec_const_redefine.cpp
index ac65587905bf0..9724ea86d43c8 100644
--- a/sycl/test/spec_const/spec_const_redefine.cpp
+++ b/sycl/test/spec_const/spec_const_redefine.cpp
@@ -71,9 +71,9 @@ int main(int argc, char **argv) {
   for (int i = 0; i < n_sc_sets; i++) {
     cl::sycl::program program(q.get_context());
     const int *sc_set = &sc_vals[i][0];
-    cl::sycl::experimental::spec_constant<int32_t, SC0> sc0 =
+    cl::sycl::ext::oneapi::spec_constant<int32_t, SC0> sc0 =
         program.set_spec_constant<SC0>(sc_set[0]);
-    cl::sycl::experimental::spec_constant<int32_t, SC1> sc1 =
+    cl::sycl::ext::oneapi::spec_constant<int32_t, SC1> sc1 =
         program.set_spec_constant<SC1>(sc_set[1]);
 
     program.build_with_kernel_type<KernelAAA>();
diff --git a/sycl/test/spec_const/spec_const_types.cpp b/sycl/test/spec_const/spec_const_types.cpp
index c7017b2b69726..5abf49eb3c00c 100644
--- a/sycl/test/spec_const/spec_const_types.cpp
+++ b/sycl/test/spec_const/spec_const_types.cpp
@@ -42,49 +42,49 @@ int main() {
   cl::sycl::program program(queue.get_context());
 
   // Create specialization constants.
-  cl::sycl::experimental::spec_constant<bool, MyBoolConst> i1 =
+  cl::sycl::ext::oneapi::spec_constant<bool, MyBoolConst> i1 =
       program.set_spec_constant<MyBoolConst>((bool)get_value());
   // CHECK-DAG: _ZTS11MyBoolConst=1|0
 
-  cl::sycl::experimental::spec_constant<int8_t, MyInt8Const> i8 =
+  cl::sycl::ext::oneapi::spec_constant<int8_t, MyInt8Const> i8 =
       program.set_spec_constant<MyInt8Const>((int8_t)get_value());
   // CHECK-DAG: _ZTS11MyInt8Const=1|1
-  cl::sycl::experimental::spec_constant<uint8_t, MyUInt8Const> ui8 =
+  cl::sycl::ext::oneapi::spec_constant<uint8_t, MyUInt8Const> ui8 =
       program.set_spec_constant<MyUInt8Const>((uint8_t)get_value());
   // CHECK-DAG: _ZTS12MyUInt8Const=1|2
 
-  cl::sycl::experimental::spec_constant<int16_t, MyInt16Const> i16 =
+  cl::sycl::ext::oneapi::spec_constant<int16_t, MyInt16Const> i16 =
       program.set_spec_constant<MyInt16Const>((int16_t)get_value());
   // CHECK-DAG: _ZTS12MyInt16Const=1|3
-  cl::sycl::experimental::spec_constant<uint16_t, MyUInt16Const> ui16 =
+  cl::sycl::ext::oneapi::spec_constant<uint16_t, MyUInt16Const> ui16 =
       program.set_spec_constant<MyUInt16Const>((uint16_t)get_value());
   // CHECK-DAG: _ZTS13MyUInt16Const=1|4
 
-  cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+  cl::sycl::ext::oneapi::spec_constant<int32_t, MyInt32Const> i32 =
       program.set_spec_constant<MyInt32Const>((int32_t)get_value());
   // CHECK-DAG: _ZTS12MyInt32Const=1|5
-  cl::sycl::experimental::spec_constant<uint32_t, MyUInt32Const> ui32 =
+  cl::sycl::ext::oneapi::spec_constant<uint32_t, MyUInt32Const> ui32 =
       program.set_spec_constant<MyUInt32Const>((uint32_t)get_value());
   // CHECK-DAG: _ZTS13MyUInt32Const=1|6
 
-  cl::sycl::experimental::spec_constant<int64_t, MyInt64Const> i64 =
+  cl::sycl::ext::oneapi::spec_constant<int64_t, MyInt64Const> i64 =
       program.set_spec_constant<MyInt64Const>((int64_t)get_value());
   // CHECK-DAG: _ZTS12MyInt64Const=1|7
-  cl::sycl::experimental::spec_constant<uint64_t, MyUInt64Const> ui64 =
+  cl::sycl::ext::oneapi::spec_constant<uint64_t, MyUInt64Const> ui64 =
       program.set_spec_constant<MyUInt64Const>((uint64_t)get_value());
   // CHECK-DAG: _ZTS13MyUInt64Const=1|8
 
 #define HALF 0 // TODO not yet supported
 #if HALF
-  cl::sycl::experimental::spec_constant<cl::sycl::half, MyHalfConst> f16 =
+  cl::sycl::ext::oneapi::spec_constant<cl::sycl::half, MyHalfConst> f16 =
       program.set_spec_constant<MyHalfConst>((cl::sycl::half)get_value());
 #endif
 
-  cl::sycl::experimental::spec_constant<float, MyFloatConst> f32 =
+  cl::sycl::ext::oneapi::spec_constant<float, MyFloatConst> f32 =
       program.set_spec_constant<MyFloatConst>((float)get_value());
   // CHECK-DAG: _ZTS12MyFloatConst=1|9
 
-  cl::sycl::experimental::spec_constant<double, MyDoubleConst> f64 =
+  cl::sycl::ext::oneapi::spec_constant<double, MyDoubleConst> f64 =
       program.set_spec_constant<MyDoubleConst>((double)get_value());
   // CHECK-DAG: _ZTS13MyDoubleConst=1|10
 
diff --git a/sycl/test/sub_group/attributes.cpp b/sycl/test/sub_group/attributes.cpp
index d8173d2d1cf72..28c5a99a7fa8b 100644
--- a/sycl/test/sub_group/attributes.cpp
+++ b/sycl/test/sub_group/attributes.cpp
@@ -18,13 +18,13 @@
 
 #include <CL/sycl.hpp>
 
-#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)                                         \
-  class KernelFunctor##SIZE {                                                  \
-  public:                                                                      \
-    [[cl::intel_reqd_sub_group_size(SIZE)]] void                               \
-    operator()(cl::sycl::nd_item<1> Item) {                                    \
-      const auto GID = Item.get_global_id();                                   \
-    }                                                                          \
+#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)           \
+  class KernelFunctor##SIZE {                    \
+  public:                                        \
+    [[cl::intel_reqd_sub_group_size(SIZE)]] void \
+    operator()(cl::sycl::nd_item<1> Item) {      \
+      const auto GID = Item.get_global_id();     \
+    }                                            \
   };
 
 KERNEL_FUNCTOR_WITH_SIZE(1);
@@ -44,7 +44,8 @@ inline uint32_t flp2(uint32_t X) {
   return X - (X >> 1);
 }
 
-template <typename Fn> inline void submit(cl::sycl::queue &Q) {
+template <typename Fn>
+inline void submit(cl::sycl::queue &Q) {
   Q.submit([](cl::sycl::handler &cgh) {
     Fn F;
     cgh.parallel_for(cl::sycl::nd_range<1>{64, 16}, F);
diff --git a/sycl/test/sub_group/barrier.cpp b/sycl/test/sub_group/barrier.cpp
index 25e31cbeb521c..c8306c0cc18df 100644
--- a/sycl/test/sub_group/barrier.cpp
+++ b/sycl/test/sub_group/barrier.cpp
@@ -19,9 +19,11 @@
 #include <CL/sycl.hpp>
 #include <limits>
 #include <numeric>
-template <typename T> class sycl_subgr;
+template <typename T>
+class sycl_subgr;
 using namespace cl::sycl;
-template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
+template <typename T>
+void check(queue &Queue, size_t G = 240, size_t L = 60) {
   try {
     nd_range<1> NdRange(G, L);
     std::vector<T> data(G);
@@ -33,7 +35,7 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
       cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         size_t lid = SG.get_local_id().get(0);
         size_t gid = NdItem.get_global_id(0);
         size_t SGoff = gid - lid;
diff --git a/sycl/test/sub_group/broadcast.hpp b/sycl/test/sub_group/broadcast.hpp
index b7c6128cde0c2..9f722dccc3ce7 100644
--- a/sycl/test/sub_group/broadcast.hpp
+++ b/sycl/test/sub_group/broadcast.hpp
@@ -22,7 +22,7 @@ void check(queue &Queue) {
       auto syclacc = syclbuf.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         /*Broadcast GID of element with SGLID == SGID */
         syclacc[NdItem.get_global_id()] =
             broadcast(SG, T(NdItem.get_global_id(0)), SG.get_group_id());
diff --git a/sycl/test/sub_group/common.cpp b/sycl/test/sub_group/common.cpp
index 17b1a9d8166d8..4863f25a96aab 100644
--- a/sycl/test/sub_group/common.cpp
+++ b/sycl/test/sub_group/common.cpp
@@ -36,7 +36,7 @@ void check(queue &Queue, unsigned int G, unsigned int L) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
       auto syclacc = syclbuf.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<class sycl_subgr>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         syclacc[NdItem.get_global_id()].local_id = SG.get_local_id().get(0);
         syclacc[NdItem.get_global_id()].local_range =
             SG.get_local_range().get(0);
diff --git a/sycl/test/sub_group/common_ocl.cpp b/sycl/test/sub_group/common_ocl.cpp
index 232e6c6c11acc..4b4085afb2b24 100644
--- a/sycl/test/sub_group/common_ocl.cpp
+++ b/sycl/test/sub_group/common_ocl.cpp
@@ -64,7 +64,7 @@ void check(queue &Queue, const int G, const int L, const char *SpvFile) {
     Queue.submit([&](handler &cgh) {
       auto syclacc = syclbuf.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<class sycl_subgr>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         syclacc[NdItem.get_global_id()].local_id = SG.get_local_id().get(0);
         syclacc[NdItem.get_global_id()].local_range =
             SG.get_local_range().get(0);
diff --git a/sycl/test/sub_group/generic-shuffle.cpp b/sycl/test/sub_group/generic-shuffle.cpp
index d2d7e191dfa32..2f6554f61f82d 100644
--- a/sycl/test/sub_group/generic-shuffle.cpp
+++ b/sycl/test/sub_group/generic-shuffle.cpp
@@ -41,7 +41,7 @@ void check_pointer(queue &Queue, size_t G = 240, size_t L = 60) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
       cgh.parallel_for<pointer_kernel<T>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);
         if (wggid == 0)
@@ -127,7 +127,7 @@ void check_struct(queue &Queue, Generator &Gen, size_t G = 240, size_t L = 60) {
       auto in = buf_in.template get_access<access::mode::read>(cgh);
 
       cgh.parallel_for<pointer_kernel<T>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);
         if (wggid == 0)
diff --git a/sycl/test/sub_group/helper.hpp b/sycl/test/sub_group/helper.hpp
index a171743b9fe79..bc88372c870ae 100644
--- a/sycl/test/sub_group/helper.hpp
+++ b/sycl/test/sub_group/helper.hpp
@@ -12,12 +12,14 @@
 
 using namespace cl::sycl;
 
-template <typename T1, int N> struct utils {
+template <typename T1, int N>
+struct utils {
   static T1 add_vec(const vec<T1, N> &v);
   static bool cmp_vec(const vec<T1, N> &v, const vec<T1, N> &r);
   static std::string stringify_vec(const vec<T1, N> &v);
 };
-template <typename T2> struct utils<T2, 1> {
+template <typename T2>
+struct utils<T2, 1> {
   static T2 add_vec(const vec<T2, 1> &v) { return v.s0(); }
   static bool cmp_vec(const vec<T2, 1> &v, const vec<T2, 1> &r) {
     return v.s0() == r.s0();
@@ -26,7 +28,8 @@ template <typename T2> struct utils<T2, 1> {
     return std::to_string((T2)v.s0());
   }
 };
-template <typename T2> struct utils<T2, 2> {
+template <typename T2>
+struct utils<T2, 2> {
   static T2 add_vec(const vec<T2, 2> &v) { return v.s0() + v.s1(); }
   static bool cmp_vec(const vec<T2, 2> &v, const vec<T2, 2> &r) {
     return v.s0() == r.s0() && v.s1() == r.s1();
@@ -36,7 +39,8 @@ template <typename T2> struct utils<T2, 2> {
            std::to_string((T2)v.s1()) + " )";
   }
 };
-template <typename T2> struct utils<T2, 4> {
+template <typename T2>
+struct utils<T2, 4> {
   static T2 add_vec(const vec<T2, 4> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3();
   }
@@ -50,7 +54,8 @@ template <typename T2> struct utils<T2, 4> {
            std::to_string((T2)v.s3()) + " )";
   }
 };
-template <typename T2> struct utils<T2, 8> {
+template <typename T2>
+struct utils<T2, 8> {
   static T2 add_vec(const vec<T2, 8> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
            v.s7();
@@ -69,7 +74,8 @@ template <typename T2> struct utils<T2, 8> {
   }
 };
 
-template <typename T2> struct utils<T2, 16> {
+template <typename T2>
+struct utils<T2, 16> {
   static T2 add_vec(const vec<T2, 16> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
            v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() +
@@ -96,7 +102,8 @@ template <typename T2> struct utils<T2, 16> {
   }
 };
 
-template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
+template <typename T>
+void exit_if_not_equal(T val, T ref, const char *name) {
   if (std::is_floating_point<T>::value) {
     if (std::fabs(val - ref) > 0.01) {
       std::cout << "Unexpected result for " << name << ": " << (double)val
@@ -130,9 +137,10 @@ void exit_if_not_equal(T *val, T *ref, const char *name) {
   }
 }
 
-template <> void exit_if_not_equal(half val, half ref, const char *name) {
-  int16_t cmp_val = reinterpret_cast<int16_t&>(val);
-  int16_t cmp_ref = reinterpret_cast<int16_t&>(ref);
+template <>
+void exit_if_not_equal(half val, half ref, const char *name) {
+  int16_t cmp_val = reinterpret_cast<int16_t &>(val);
+  int16_t cmp_ref = reinterpret_cast<int16_t &>(ref);
   if (std::abs(cmp_val - cmp_ref) > 1) {
     std::cout << "Unexpected result for " << name << ": " << (float)val
               << " expected value: " << (float)ref << std::endl;
diff --git a/sycl/test/sub_group/load_store.cpp b/sycl/test/sub_group/load_store.cpp
index 8366e8baca7d9..507b65a8261be 100644
--- a/sycl/test/sub_group/load_store.cpp
+++ b/sycl/test/sub_group/load_store.cpp
@@ -17,11 +17,13 @@
 
 #include "helper.hpp"
 #include <CL/sycl.hpp>
-template <typename T, int N> class sycl_subgr;
+template <typename T, int N>
+class sycl_subgr;
 
 using namespace cl::sycl;
 
-template <typename T, int N> void check(queue &Queue) {
+template <typename T, int N>
+void check(queue &Queue) {
   const int G = 1024, L = 128;
   try {
     nd_range<1> NdRange(G, L);
@@ -40,7 +42,7 @@ template <typename T, int N> void check(queue &Queue) {
       accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
           {L}, cgh);
       cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         if (SG.get_group_id().get(0) % N == 0) {
           size_t SGOffset =
               SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
@@ -94,7 +96,8 @@ template <typename T, int N> void check(queue &Queue) {
     exit(1);
   }
 }
-template <typename T> void check(queue &Queue) {
+template <typename T>
+void check(queue &Queue) {
   const int G = 128, L = 64;
   try {
     nd_range<1> NdRange(G, L);
@@ -114,7 +117,7 @@ template <typename T> void check(queue &Queue) {
       accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
           {L}, cgh);
       cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         if (NdItem.get_global_id(0) == 0)
           sgsizeacc[0] = SG.get_max_local_range()[0];
         size_t SGOffset =
diff --git a/sycl/test/sub_group/reduce.hpp b/sycl/test/sub_group/reduce.hpp
index 2fd29e30a3081..31fd2c5a459b6 100644
--- a/sycl/test/sub_group/reduce.hpp
+++ b/sycl/test/sub_group/reduce.hpp
@@ -26,7 +26,7 @@ void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
       auto acc = buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<sycl_subgr<T, BinaryOperation>>(
           NdRange, [=](nd_item<1> NdItem) {
-            intel::sub_group sg = NdItem.get_sub_group();
+            ext::oneapi::sub_group sg = NdItem.get_sub_group();
             if (skip_init) {
               acc[NdItem.get_global_id(0)] =
                   reduce(sg, T(NdItem.get_global_id(0)), op);
@@ -73,23 +73,23 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
     L = 32;
   }
 
-  check_op<T>(Queue, T(L), intel::plus<T>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::plus<T>(), true, G, L);
+  check_op<T>(Queue, T(L), ext::oneapi::plus<T>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::plus<T>(), true, G, L);
 
-  check_op<T>(Queue, T(0), intel::minimum<T>(), false, G, L);
-  check_op<T>(Queue, T(G), intel::minimum<T>(), true, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::minimum<T>(), false, G, L);
+  check_op<T>(Queue, T(G), ext::oneapi::minimum<T>(), true, G, L);
 
-  check_op<T>(Queue, T(G), intel::maximum<T>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::maximum<T>(), true, G, L);
+  check_op<T>(Queue, T(G), ext::oneapi::maximum<T>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::maximum<T>(), true, G, L);
 
 #if __cplusplus >= 201402L
-  check_op<T>(Queue, T(L), intel::plus<>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::plus<>(), true, G, L);
+  check_op<T>(Queue, T(L), ext::oneapi::plus<>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::plus<>(), true, G, L);
 
-  check_op<T>(Queue, T(0), intel::minimum<>(), false, G, L);
-  check_op<T>(Queue, T(G), intel::minimum<>(), true, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::minimum<>(), false, G, L);
+  check_op<T>(Queue, T(G), ext::oneapi::minimum<>(), true, G, L);
 
-  check_op<T>(Queue, T(G), intel::maximum<>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::maximum<>(), true, G, L);
+  check_op<T>(Queue, T(G), ext::oneapi::maximum<>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::maximum<>(), true, G, L);
 #endif
 }
diff --git a/sycl/test/sub_group/scan.hpp b/sycl/test/sub_group/scan.hpp
index ebb6abda3984d..42c8c373044f9 100644
--- a/sycl/test/sub_group/scan.hpp
+++ b/sycl/test/sub_group/scan.hpp
@@ -28,7 +28,7 @@ void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
       auto inacc = inbuf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<sycl_subgr<T, BinaryOperation>>(
           NdRange, [=](nd_item<1> NdItem) {
-            intel::sub_group sg = NdItem.get_sub_group();
+            ext::oneapi::sub_group sg = NdItem.get_sub_group();
             if (skip_init) {
               exacc[NdItem.get_global_id(0)] =
                   exclusive_scan(sg, T(NdItem.get_global_id(0)), op);
@@ -81,50 +81,50 @@ void check(queue &Queue, size_t G = 120, size_t L = 60) {
     L = 32;
   }
 
-  check_op<T>(Queue, T(L), intel::plus<T>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::plus<T>(), true, G, L);
+  check_op<T>(Queue, T(L), ext::oneapi::plus<T>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::plus<T>(), true, G, L);
 
-  check_op<T>(Queue, T(0), intel::minimum<T>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::minimum<T>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, std::numeric_limits<T>::infinity(), intel::minimum<T>(),
+    check_op<T>(Queue, std::numeric_limits<T>::infinity(), ext::oneapi::minimum<T>(),
                 true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::max(), intel::minimum<T>(), true,
+    check_op<T>(Queue, std::numeric_limits<T>::max(), ext::oneapi::minimum<T>(), true,
                 G, L);
   }
 
-  check_op<T>(Queue, T(G), intel::maximum<T>(), false, G, L);
+  check_op<T>(Queue, T(G), ext::oneapi::maximum<T>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, -std::numeric_limits<T>::infinity(), intel::maximum<T>(),
+    check_op<T>(Queue, -std::numeric_limits<T>::infinity(), ext::oneapi::maximum<T>(),
                 true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::min(), intel::maximum<T>(), true,
+    check_op<T>(Queue, std::numeric_limits<T>::min(), ext::oneapi::maximum<T>(), true,
                 G, L);
   }
 
 #if __cplusplus >= 201402L
-  check_op<T>(Queue, T(L), intel::plus<>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::plus<>(), true, G, L);
+  check_op<T>(Queue, T(L), ext::oneapi::plus<>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::plus<>(), true, G, L);
 
-  check_op<T>(Queue, T(0), intel::minimum<>(), false, G, L);
+  check_op<T>(Queue, T(0), ext::oneapi::minimum<>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, std::numeric_limits<T>::infinity(), intel::minimum<>(),
+    check_op<T>(Queue, std::numeric_limits<T>::infinity(), ext::oneapi::minimum<>(),
                 true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::max(), intel::minimum<>(), true,
+    check_op<T>(Queue, std::numeric_limits<T>::max(), ext::oneapi::minimum<>(), true,
                 G, L);
   }
 
-  check_op<T>(Queue, T(G), intel::maximum<>(), false, G, L);
+  check_op<T>(Queue, T(G), ext::oneapi::maximum<>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, -std::numeric_limits<T>::infinity(), intel::maximum<>(),
+    check_op<T>(Queue, -std::numeric_limits<T>::infinity(), ext::oneapi::maximum<>(),
                 true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::min(), intel::maximum<>(), true,
+    check_op<T>(Queue, std::numeric_limits<T>::min(), ext::oneapi::maximum<>(), true,
                 G, L);
   }
 #endif
diff --git a/sycl/test/sub_group/shuffle.hpp b/sycl/test/sub_group/shuffle.hpp
index 94c82ab99c2d1..fde4a03e5fde0 100644
--- a/sycl/test/sub_group/shuffle.hpp
+++ b/sycl/test/sub_group/shuffle.hpp
@@ -43,7 +43,7 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
       cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);
         vec<T, N> vwggid(wggid), vsgid(sgid);
@@ -150,7 +150,7 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
       cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);
         if (wggid == 0)
diff --git a/sycl/test/sub_group/vote.cpp b/sycl/test/sub_group/vote.cpp
index 382266fa412c0..80283f67cdbbe 100644
--- a/sycl/test/sub_group/vote.cpp
+++ b/sycl/test/sub_group/vote.cpp
@@ -49,7 +49,7 @@ void check(queue Queue, const int G, const int L, const int D, const int R) {
       auto sganyacc = sganybuf.get_access<access::mode::read_write>(cgh);
       auto sgallacc = sgallbuf.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<class subgr>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
         /* Set to 1 if any local ID in subgroup devided by D has remainder R */
         if (any_of(SG, SG.get_local_id().get(0) % D == R)) {
           sganyacc[NdItem.get_global_id()] = 1;

From 2444feda7c545399a46f6a99b5915f4e78963fb6 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Tue, 21 Jul 2020 17:08:40 -0400
Subject: [PATCH 03/13] clang-format

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/test/atomic_ref/store.cpp         |  12 +-
 sycl/test/built-ins/vector_integer.cpp | 192 ++++++++++++-------------
 2 files changed, 101 insertions(+), 103 deletions(-)

diff --git a/sycl/test/atomic_ref/store.cpp b/sycl/test/atomic_ref/store.cpp
index 69abba8e0580d..c923fd2997ac7 100644
--- a/sycl/test/atomic_ref/store.cpp
+++ b/sycl/test/atomic_ref/store.cpp
@@ -11,11 +11,9 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-class store_kernel;
+template <typename T> class store_kernel;
 
-template <typename T>
-void store_test(queue q, size_t N) {
+template <typename T> void store_test(queue q, size_t N) {
   T initial = std::numeric_limits<T>::max();
   T store = initial;
   {
@@ -24,7 +22,9 @@ void store_test(queue q, size_t N) {
       auto st = store_buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<store_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(st[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(st[0]);
         atm.store(T(gid));
       });
     });
@@ -55,7 +55,7 @@ int main() {
   store_test<unsigned long long>(q, N);
   store_test<float>(q, N);
   store_test<double>(q, N);
-  //store_test<char*>(q, N);
+  // store_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/built-ins/vector_integer.cpp b/sycl/test/built-ins/vector_integer.cpp
index 74db909d278bb..fea10e39e44e4 100644
--- a/sycl/test/built-ins/vector_integer.cpp
+++ b/sycl/test/built-ins/vector_integer.cpp
@@ -14,14 +14,14 @@ namespace s = cl::sycl;
 int main() {
   // max
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class maxSI2SI2>([=]() {
-          AccR[0] = s::max(s::cl_int2{ 5, 3 }, s::cl_int2{ 2, 7 });
+          AccR[0] = s::max(s::cl_int2{5, 3}, s::cl_int2{2, 7});
         });
       });
     }
@@ -33,14 +33,14 @@ int main() {
 
   // max
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class maxUI2UI2>([=]() {
-          AccR[0] = s::max(s::cl_uint2{ 5, 3 }, s::cl_uint2{ 2, 7 });
+          AccR[0] = s::max(s::cl_uint2{5, 3}, s::cl_uint2{2, 7});
         });
       });
     }
@@ -52,14 +52,14 @@ int main() {
 
   // max
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class maxSI2SI1>([=]() {
-          AccR[0] = s::max(s::cl_int2{ 5, 3 }, s::cl_int{ 2 });
+          AccR[0] = s::max(s::cl_int2{5, 3}, s::cl_int{2});
         });
       });
     }
@@ -71,14 +71,14 @@ int main() {
 
   // max (longlong2)
   {
-    s::longlong2 r{ 0 };
+    s::longlong2 r{0};
     {
       s::buffer<s::longlong2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class maxSLL2SLL1>([=]() {
-          AccR[0] = s::max(s::longlong2{ 5, 3 }, s::longlong{ 2 });
+          AccR[0] = s::max(s::longlong2{5, 3}, s::longlong{2});
         });
       });
     }
@@ -90,14 +90,14 @@ int main() {
 
   // max
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class maxUI2UI1>([=]() {
-          AccR[0] = s::max(s::cl_uint2{ 5, 3 }, s::cl_uint{ 2 });
+          AccR[0] = s::max(s::cl_uint2{5, 3}, s::cl_uint{2});
         });
       });
     }
@@ -109,14 +109,14 @@ int main() {
 
   // max (ulonglong2)
   {
-    s::ulonglong2 r{ 0 };
+    s::ulonglong2 r{0};
     {
       s::buffer<s::ulonglong2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class maxULL2ULL1>([=]() {
-          AccR[0] = s::max(s::ulonglong2{ 5, 3 }, s::ulonglong{ 2 });
+          AccR[0] = s::max(s::ulonglong2{5, 3}, s::ulonglong{2});
         });
       });
     }
@@ -125,17 +125,17 @@ int main() {
     assert(r1 == 5);
     assert(r2 == 3);
   }
-  
+
   // min
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class minSI2SI2>([=]() {
-          AccR[0] = s::min(s::cl_int2{ 5, 3 }, s::cl_int2{ 2, 7 });
+          AccR[0] = s::min(s::cl_int2{5, 3}, s::cl_int2{2, 7});
         });
       });
     }
@@ -147,14 +147,14 @@ int main() {
 
   // min
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class minUI2UI2>([=]() {
-          AccR[0] = s::min(s::cl_uint2{ 5, 3 }, s::cl_uint2{ 2, 7 });
+          AccR[0] = s::min(s::cl_uint2{5, 3}, s::cl_uint2{2, 7});
         });
       });
     }
@@ -166,14 +166,14 @@ int main() {
 
   // min
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class minSI2SI1>([=]() {
-          AccR[0] = s::min(s::cl_int2{ 5, 3 }, s::cl_int{ 2 });
+          AccR[0] = s::min(s::cl_int2{5, 3}, s::cl_int{2});
         });
       });
     }
@@ -185,14 +185,14 @@ int main() {
 
   // min
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class minUI2UI1>([=]() {
-          AccR[0] = s::min(s::cl_uint2{ 5, 3 }, s::cl_uint{ 2 });
+          AccR[0] = s::min(s::cl_uint2{5, 3}, s::cl_uint{2});
         });
       });
     }
@@ -204,14 +204,14 @@ int main() {
 
   // abs
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class absSI2>([=]() {
-          AccR[0] = s::abs(s::cl_int2{ -5, -2 });
+          AccR[0] = s::abs(s::cl_int2{-5, -2});
         });
       });
     }
@@ -223,14 +223,14 @@ int main() {
 
   // abs (longlong)
   {
-    s::ulonglong2 r{ 0 };
+    s::ulonglong2 r{0};
     {
       s::buffer<s::ulonglong2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class absSL2>([=]() {
-          AccR[0] = s::abs(s::longlong2{ -5, -2 });
+          AccR[0] = s::abs(s::longlong2{-5, -2});
         });
       });
     }
@@ -242,14 +242,14 @@ int main() {
 
   // abs_diff
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class abs_diffSI2SI2>([=]() {
-          AccR[0] = s::abs_diff(s::cl_int2{ -5, -2 }, s::cl_int2{ -1, -1 });
+          AccR[0] = s::abs_diff(s::cl_int2{-5, -2}, s::cl_int2{-1, -1});
         });
       });
     }
@@ -261,15 +261,15 @@ int main() {
 
   // add_sat
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class add_satSI2SI2>([=]() {
-          AccR[0] = s::add_sat(s::cl_int2{ 0x7FFFFFFF, 0x7FFFFFFF },
-                               s::cl_int2{ 100, 90 });
+          AccR[0] = s::add_sat(s::cl_int2{0x7FFFFFFF, 0x7FFFFFFF},
+                               s::cl_int2{100, 90});
         });
       });
     }
@@ -281,15 +281,15 @@ int main() {
 
   // hadd
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class haddSI2SI2>([=]() {
-          AccR[0] = s::hadd(s::cl_int2{ 0x0000007F, 0x0000007F },
-                            s::cl_int2{ 0x00000020, 0x00000020 });
+          AccR[0] = s::hadd(s::cl_int2{0x0000007F, 0x0000007F},
+                            s::cl_int2{0x00000020, 0x00000020});
         });
       });
     }
@@ -301,15 +301,15 @@ int main() {
 
   // rhadd
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class rhaddSI2SI2>([=]() {
-          AccR[0] = s::rhadd(s::cl_int2{ 0x0000007F, 0x0000007F },
-                             s::cl_int2{ 0x00000020, 0x00000020 });
+          AccR[0] = s::rhadd(s::cl_int2{0x0000007F, 0x0000007F},
+                             s::cl_int2{0x00000020, 0x00000020});
         });
       });
     }
@@ -321,15 +321,15 @@ int main() {
 
   // clamp - 1
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class clampSI2SI2SI2>([=]() {
-          AccR[0] = s::clamp(s::cl_int2{ 5, 5 }, s::cl_int2{ 10, 10 },
-                             s::cl_int2{ 30, 30 });
+          AccR[0] = s::clamp(s::cl_int2{5, 5}, s::cl_int2{10, 10},
+                             s::cl_int2{30, 30});
         });
       });
     }
@@ -341,15 +341,14 @@ int main() {
 
   // clamp - 2
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class clampSI2SI1SI1>([=]() {
-          AccR[0] =
-              s::clamp(s::cl_int2{ 5, 5 }, s::cl_int{ 10 }, s::cl_int{ 30 });
+          AccR[0] = s::clamp(s::cl_int2{5, 5}, s::cl_int{10}, s::cl_int{30});
         });
       });
     }
@@ -361,14 +360,14 @@ int main() {
 
   // clz
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class clzSI2>([=]() {
-          AccR[0] = s::clz(s::cl_int2{ 0x0FFFFFFF, 0x0FFFFFFF });
+          AccR[0] = s::clz(s::cl_int2{0x0FFFFFFF, 0x0FFFFFFF});
         });
       });
     }
@@ -380,14 +379,14 @@ int main() {
 
   // ctz
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class ctzSI2>([=]() {
-          AccR[0] = s::ext::oneapi::ctz(s::cl_int2{ 0x7FFFFFF0, 0x7FFFFFF0 });
+          AccR[0] = s::ext::oneapi::ctz(s::cl_int2{0x7FFFFFF0, 0x7FFFFFF0});
         });
       });
     }
@@ -399,16 +398,16 @@ int main() {
 
   // mad_hi
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mad_hiSI2SI2SI2>([=]() {
-          AccR[0] = s::mad_hi(s::cl_int2{ 0x10000000, 0x10000000 },
-                              s::cl_int2{ 0x00000100, 0x00000100 },
-                              s::cl_int2{ 1, 1 });
+          AccR[0] =
+              s::mad_hi(s::cl_int2{0x10000000, 0x10000000},
+                        s::cl_int2{0x00000100, 0x00000100}, s::cl_int2{1, 1});
         });
       });
     }
@@ -420,16 +419,16 @@ int main() {
 
   // mad_sat
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mad_satSI2SI2SI2>([=]() {
-          AccR[0] = s::mad_sat(s::cl_int2{ 0x10000000, 0x10000000 },
-                               s::cl_int2{ 0x00000100, 0x00000100 },
-                               s::cl_int2{ 1, 1 });
+          AccR[0] =
+              s::mad_sat(s::cl_int2{0x10000000, 0x10000000},
+                         s::cl_int2{0x00000100, 0x00000100}, s::cl_int2{1, 1});
         });
       });
     }
@@ -441,15 +440,15 @@ int main() {
 
   // mul_hi
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mul_hiSI2SI2>([=]() {
-          AccR[0] = s::mul_hi(s::cl_int2{ 0x10000000, 0x10000000 },
-                              s::cl_int2{ 0x00000100, 0x00000100 });
+          AccR[0] = s::mul_hi(s::cl_int2{0x10000000, 0x10000000},
+                              s::cl_int2{0x00000100, 0x00000100});
         });
       });
     }
@@ -461,15 +460,15 @@ int main() {
 
   // rotate
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class rotateSI2SI2>([=]() {
-          AccR[0] = s::rotate(s::cl_int2{ 0x11100000, 0x11100000 },
-                              s::cl_int2{ 12, 12 });
+          AccR[0] =
+              s::rotate(s::cl_int2{0x11100000, 0x11100000}, s::cl_int2{12, 12});
         });
       });
     }
@@ -482,50 +481,49 @@ int main() {
   // sub_sat
   {
     auto TestSubSat = [](s::cl_int2 x, s::cl_int2 y) {
-      s::cl_int2 r{ 0 };
+      s::cl_int2 r{0};
       {
         s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
         s::queue myQueue;
         myQueue.submit([&](s::handler &cgh) {
           auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-          cgh.single_task<class sub_satSI2SI2>([=]() {
-            AccR[0] = s::sub_sat(x, y);
-          });
+          cgh.single_task<class sub_satSI2SI2>(
+              [=]() { AccR[0] = s::sub_sat(x, y); });
         });
       }
       return r;
     };
-    s::cl_int2 r1 = TestSubSat(s::cl_int2{ 10, 10 },
-                               s::cl_int2{ 0x80000000, 0x80000000 });
+    s::cl_int2 r1 =
+        TestSubSat(s::cl_int2{10, 10}, s::cl_int2{0x80000000, 0x80000000});
     s::cl_int r1x = r1.x();
     s::cl_int r1y = r1.y();
     assert(r1x == 0x7FFFFFFF);
     assert(r1y == 0x7FFFFFFF);
-    s::cl_int2 r2 = TestSubSat(s::cl_int2{ 0x7FFFFFFF, 0x80000000 },
-                               s::cl_int2{ 0xFFFFFFFF, 0x00000001 });
+    s::cl_int2 r2 = TestSubSat(s::cl_int2{0x7FFFFFFF, 0x80000000},
+                               s::cl_int2{0xFFFFFFFF, 0x00000001});
     s::cl_int r2x = r2.x();
     s::cl_int r2y = r2.y();
     assert(r2x == 0x7FFFFFFF);
     assert(r2y == 0x80000000);
-    s::cl_int2 r3 = TestSubSat(s::cl_int2{ 10499, 30678 },
-                               s::cl_int2{ 30678, 10499 });
+    s::cl_int2 r3 =
+        TestSubSat(s::cl_int2{10499, 30678}, s::cl_int2{30678, 10499});
     s::cl_int r3x = r3.x();
     s::cl_int r3y = r3.y();
     assert(r3x == -20179);
-    assert(r3y ==  20179);
+    assert(r3y == 20179);
   }
 
   // upsample - 1
   {
-    s::cl_ushort2 r{ 0 };
+    s::cl_ushort2 r{0};
     {
       s::buffer<s::cl_ushort2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleUC2UC2>([=]() {
-          AccR[0] = s::upsample(s::cl_uchar2{ 0x10, 0x10 },
-                                s::cl_uchar2{ 0x10, 0x10 });
+          AccR[0] =
+              s::upsample(s::cl_uchar2{0x10, 0x10}, s::cl_uchar2{0x10, 0x10});
         });
       });
     }
@@ -537,15 +535,15 @@ int main() {
 
   // upsample - 2
   {
-    s::cl_short2 r{ 0 };
+    s::cl_short2 r{0};
     {
       s::buffer<s::cl_short2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSC2UC2>([=]() {
-          AccR[0] = s::upsample(s::cl_char2{ 0x10, 0x10 },
-                                s::cl_uchar2{ 0x10, 0x10 });
+          AccR[0] =
+              s::upsample(s::cl_char2{0x10, 0x10}, s::cl_uchar2{0x10, 0x10});
         });
       });
     }
@@ -557,15 +555,15 @@ int main() {
 
   // upsample - 3
   {
-    s::cl_uint2 r{ 0 };
+    s::cl_uint2 r{0};
     {
       s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleUS2US2>([=]() {
-          AccR[0] = s::upsample(s::cl_ushort2{ 0x0010, 0x0010 },
-                                s::cl_ushort2{ 0x0010, 0x0010 });
+          AccR[0] = s::upsample(s::cl_ushort2{0x0010, 0x0010},
+                                s::cl_ushort2{0x0010, 0x0010});
         });
       });
     }
@@ -577,15 +575,15 @@ int main() {
 
   // upsample - 4
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSS2US2>([=]() {
-          AccR[0] = s::upsample(s::cl_short2{ 0x0010, 0x0010 },
-                                s::cl_ushort2{ 0x0010, 0x0010 });
+          AccR[0] = s::upsample(s::cl_short2{0x0010, 0x0010},
+                                s::cl_ushort2{0x0010, 0x0010});
         });
       });
     }
@@ -597,15 +595,15 @@ int main() {
 
   // upsample - 5
   {
-    s::cl_ulong2 r{ 0 };
+    s::cl_ulong2 r{0};
     {
       s::buffer<s::cl_ulong2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleUI2UI2>([=]() {
-          AccR[0] = s::upsample(s::cl_uint2{ 0x00000010, 0x00000010 },
-                                s::cl_uint2{ 0x00000010, 0x00000010 });
+          AccR[0] = s::upsample(s::cl_uint2{0x00000010, 0x00000010},
+                                s::cl_uint2{0x00000010, 0x00000010});
         });
       });
     }
@@ -617,15 +615,15 @@ int main() {
 
   // upsample - 6
   {
-    s::cl_long2 r{ 0 };
+    s::cl_long2 r{0};
     {
       s::buffer<s::cl_long2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSI2UI2>([=]() {
-          AccR[0] = s::upsample(s::cl_int2{ 0x00000010, 0x00000010 },
-                                s::cl_uint2{ 0x00000010, 0x00000010 });
+          AccR[0] = s::upsample(s::cl_int2{0x00000010, 0x00000010},
+                                s::cl_uint2{0x00000010, 0x00000010});
         });
       });
     }
@@ -637,14 +635,14 @@ int main() {
 
   // popcount
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class popcountSI2>([=]() {
-          AccR[0] = s::popcount(s::cl_int2{ 0x000000FF, 0x000000FF });
+          AccR[0] = s::popcount(s::cl_int2{0x000000FF, 0x000000FF});
         });
       });
     }
@@ -656,15 +654,15 @@ int main() {
 
   // mad24
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mad24SI2SI2SI2>([=]() {
-          AccR[0] = s::mad24(s::cl_int2{ 0xFFFFFFFF, 0xFFFFFFFF },
-                             s::cl_int2{ 20, 20 }, s::cl_int2{ 20, 20 });
+          AccR[0] = s::mad24(s::cl_int2{0xFFFFFFFF, 0xFFFFFFFF},
+                             s::cl_int2{20, 20}, s::cl_int2{20, 20});
         });
       });
     }
@@ -676,15 +674,15 @@ int main() {
 
   // mul24
   {
-    s::cl_int2 r{ 0 };
+    s::cl_int2 r{0};
     {
       s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mul24SI2SI2SI2>([=]() {
-          AccR[0] = s::mul24(s::cl_int2{ 0xFFFFFFFF, 0xFFFFFFFF },
-                             s::cl_int2{ 20, 20 });
+          AccR[0] =
+              s::mul24(s::cl_int2{0xFFFFFFFF, 0xFFFFFFFF}, s::cl_int2{20, 20});
         });
       });
     }

From 5e5305a447dc57ca014292cce2af727575927da5 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Wed, 22 Jul 2020 15:41:53 -0400
Subject: [PATCH 04/13] more clang-format

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/include/CL/sycl/ext/oneapi/reduction.hpp |   2 +-
 sycl/include/CL/sycl/handler.hpp              |   4 +-
 sycl/test/atomic_ref/add.cpp                  |  51 +++---
 sycl/test/atomic_ref/compare_exchange.cpp     |  40 +++--
 sycl/test/atomic_ref/exchange.cpp             |  21 ++-
 sycl/test/atomic_ref/load.cpp                 |  18 +-
 sycl/test/atomic_ref/max.cpp                  |  12 +-
 sycl/test/atomic_ref/min.cpp                  |  12 +-
 sycl/test/atomic_ref/sub.cpp                  |  51 +++---
 sycl/test/built-ins/printf.cpp                |  19 +-
 sycl/test/built-ins/scalar_integer.cpp        | 168 ++++++++----------
 sycl/test/enqueue_barrier/enqueue_barrier.cpp |   2 +-
 sycl/test/fpga_tests/fpga_io_pipes.cpp        |   2 +-
 sycl/test/fpga_tests/fpga_pipes.cpp           |  44 ++---
 .../function-pointers/fp-as-kernel-arg.cpp    |  16 +-
 .../pass-fp-through-buffer.cpp                |  11 +-
 sycl/test/reduction/reduction_ctor.cpp        |  50 +++---
 sycl/test/reduction/reduction_nd_ext_type.hpp |  28 +--
 sycl/test/reduction/reduction_nd_s0_dw.cpp    |  18 +-
 sycl/test/reduction/reduction_nd_s0_rw.cpp    |  18 +-
 sycl/test/reduction/reduction_nd_s1_dw.cpp    |  18 +-
 sycl/test/reduction/reduction_nd_s1_rw.cpp    |  18 +-
 sycl/test/reduction/reduction_placeholder.cpp |  18 +-
 sycl/test/reduction/reduction_transparent.cpp |  12 +-
 sycl/test/reduction/reduction_usm.cpp         |  19 +-
 sycl/test/sub_group/attributes.cpp            |  17 +-
 sycl/test/sub_group/barrier.cpp               |   6 +-
 sycl/test/sub_group/helper.hpp                |  33 ++--
 sycl/test/sub_group/load_store.cpp            |   9 +-
 sycl/test/sub_group/scan.hpp                  |  38 ++--
 30 files changed, 387 insertions(+), 388 deletions(-)

diff --git a/sycl/include/CL/sycl/ext/oneapi/reduction.hpp b/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
index 67ced2af7bac3..0da27927f1efb 100644
--- a/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/reduction.hpp
@@ -9,8 +9,8 @@
 #pragma once
 
 #include <CL/sycl/accessor.hpp>
-#include <CL/sycl/handler.hpp>
 #include <CL/sycl/ext/oneapi/group_algorithm.hpp>
+#include <CL/sycl/handler.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 9011ba24d4c00..1a09b34825969 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -346,7 +346,7 @@ class __SYCL_EXPORT handler {
   // Recursively calls itself until arguments pack is fully processed.
   // The version for regular(standard layout) argument.
   template <typename T, typename... Ts>
-  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
+  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... Args) {
     set_arg(ArgIndex, std::move(Arg));
     setArgsHelper(++ArgIndex, std::move(Args)...);
   }
@@ -813,7 +813,7 @@ class __SYCL_EXPORT handler {
   /// Registers pack of arguments(Args) with indexes starting from 0.
   ///
   /// \param Args are argument values to be set.
-  template <typename... Ts> void set_args(Ts &&...Args) {
+  template <typename... Ts> void set_args(Ts &&... Args) {
     setArgsHelper(0, std::move(Args)...);
   }
 
diff --git a/sycl/test/atomic_ref/add.cpp b/sycl/test/atomic_ref/add.cpp
index 6face7c749bf5..6f4b86640fc53 100644
--- a/sycl/test/atomic_ref/add.cpp
+++ b/sycl/test/atomic_ref/add.cpp
@@ -12,8 +12,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-void add_fetch_test(queue q, size_t N) {
+template <typename T> void add_fetch_test(queue q, size_t N) {
   T sum = 0;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -23,10 +22,13 @@ void add_fetch_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(sum[0]);
         out[gid] = atm.fetch_add(T(1));
       });
     });
@@ -45,8 +47,7 @@ void add_fetch_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void add_plus_equal_test(queue q, size_t N) {
+template <typename T> void add_plus_equal_test(queue q, size_t N) {
   T sum = 0;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -56,10 +57,13 @@ void add_plus_equal_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(sum[0]);
         out[gid] = atm += T(1);
       });
     });
@@ -78,8 +82,7 @@ void add_plus_equal_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void add_pre_inc_test(queue q, size_t N) {
+template <typename T> void add_pre_inc_test(queue q, size_t N) {
   T sum = 0;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -89,10 +92,13 @@ void add_pre_inc_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(sum[0]);
         out[gid] = ++atm;
       });
     });
@@ -111,8 +117,7 @@ void add_pre_inc_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void add_post_inc_test(queue q, size_t N) {
+template <typename T> void add_post_inc_test(queue q, size_t N) {
   T sum = 0;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -122,10 +127,13 @@ void add_post_inc_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(sum[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(sum[0]);
         out[gid] = atm++;
       });
     });
@@ -144,8 +152,7 @@ void add_post_inc_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void add_test(queue q, size_t N) {
+template <typename T> void add_test(queue q, size_t N) {
   add_fetch_test<T>(q, N);
   add_plus_equal_test<T>(q, N);
   add_pre_inc_test<T>(q, N);
@@ -153,13 +160,11 @@ void add_test(queue q, size_t N) {
 }
 
 // Floating-point types do not support pre- or post-increment
-template <>
-void add_test<float>(queue q, size_t N) {
+template <> void add_test<float>(queue q, size_t N) {
   add_fetch_test<float>(q, N);
   add_plus_equal_test<float>(q, N);
 }
-template <>
-void add_test<double>(queue q, size_t N) {
+template <> void add_test<double>(queue q, size_t N) {
   add_fetch_test<double>(q, N);
   add_plus_equal_test<double>(q, N);
 }
@@ -183,7 +188,7 @@ int main() {
   add_test<unsigned long long>(q, N);
   add_test<float>(q, N);
   add_test<double>(q, N);
-  //add_test<char*>(q, N);
+  // add_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/atomic_ref/compare_exchange.cpp b/sycl/test/atomic_ref/compare_exchange.cpp
index 31290418a144b..5660fa64882cf 100644
--- a/sycl/test/atomic_ref/compare_exchange.cpp
+++ b/sycl/test/atomic_ref/compare_exchange.cpp
@@ -11,11 +11,9 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-class compare_exchange_kernel;
+template <typename T> class compare_exchange_kernel;
 
-template <typename T>
-void compare_exchange_test(queue q, size_t N) {
+template <typename T> void compare_exchange_test(queue q, size_t N) {
   const T initial = std::numeric_limits<T>::max();
   T compare_exchange = initial;
   std::vector<T> output(N);
@@ -25,19 +23,25 @@ void compare_exchange_test(queue q, size_t N) {
     buffer<T> output_buf(output.data(), output.size());
 
     q.submit([&](handler &cgh) {
-      auto exc = compare_exchange_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for<compare_exchange_kernel<T>>(range<1>(N), [=](item<1> it) {
-        int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(exc[0]);
-        T result = initial;
-        bool success = atm.compare_exchange_strong(result, (T)gid);
-        if (success) {
-          out[gid] = result;
-        } else {
-          out[gid] = gid;
-        }
-      });
+      auto exc =
+          compare_exchange_buf.template get_access<access::mode::read_write>(
+              cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<compare_exchange_kernel<T>>(
+          range<1>(N), [=](item<1> it) {
+            int gid = it.get_id(0);
+            auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                                  ext::oneapi::memory_scope::device,
+                                  access::address_space::global_space>(exc[0]);
+            T result = initial;
+            bool success = atm.compare_exchange_strong(result, (T)gid);
+            if (success) {
+              out[gid] = result;
+            } else {
+              out[gid] = gid;
+            }
+          });
     });
   }
 
@@ -69,7 +73,7 @@ int main() {
   compare_exchange_test<unsigned long long>(q, N);
   compare_exchange_test<float>(q, N);
   compare_exchange_test<double>(q, N);
-  //compare_exchange_test<char*>(q, N);
+  // compare_exchange_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/atomic_ref/exchange.cpp b/sycl/test/atomic_ref/exchange.cpp
index bba5dae8e29b3..3600bb36c3700 100644
--- a/sycl/test/atomic_ref/exchange.cpp
+++ b/sycl/test/atomic_ref/exchange.cpp
@@ -11,11 +11,9 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-class exchange_kernel;
+template <typename T> class exchange_kernel;
 
-template <typename T>
-void exchange_test(queue q, size_t N) {
+template <typename T> void exchange_test(queue q, size_t N) {
   const T initial = std::numeric_limits<T>::max();
   T exchange = initial;
   std::vector<T> output(N);
@@ -25,11 +23,15 @@ void exchange_test(queue q, size_t N) {
     buffer<T> output_buf(output.data(), output.size());
 
     q.submit([&](handler &cgh) {
-      auto exc = exchange_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto exc =
+          exchange_buf.template get_access<access::mode::read_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<exchange_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(exc[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(exc[0]);
         out[gid] = atm.exchange(gid);
       });
     });
@@ -38,7 +40,8 @@ void exchange_test(queue q, size_t N) {
   // Only one work-item should have received the initial sentinel value
   assert(std::count(output.begin(), output.end(), initial) == 1);
 
-  // All other values should be unique; each work-item replaces the value it reads with its own ID
+  // All other values should be unique; each work-item replaces the value it
+  // reads with its own ID
   std::sort(output.begin(), output.end());
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
@@ -62,7 +65,7 @@ int main() {
   exchange_test<unsigned long long>(q, N);
   exchange_test<float>(q, N);
   exchange_test<double>(q, N);
-  //exchange_test<char*>(q, N);
+  // exchange_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/atomic_ref/load.cpp b/sycl/test/atomic_ref/load.cpp
index 4d95c4a5f1858..2bb3cf45d749c 100644
--- a/sycl/test/atomic_ref/load.cpp
+++ b/sycl/test/atomic_ref/load.cpp
@@ -11,11 +11,9 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-class load_kernel;
+template <typename T> class load_kernel;
 
-template <typename T>
-void load_test(queue q, size_t N) {
+template <typename T> void load_test(queue q, size_t N) {
   T initial = 42;
   T load = initial;
   std::vector<T> output(N);
@@ -26,10 +24,13 @@ void load_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto ld = load_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<load_kernel<T>>(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(ld[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(ld[0]);
         out[gid] = atm.load();
       });
     });
@@ -37,7 +38,8 @@ void load_test(queue q, size_t N) {
 
   // All work-items should read the same value
   // Atomicity isn't tested here, but support for load() is
-  assert(std::all_of(output.begin(), output.end(), [&](T x) { return (x == initial); }));
+  assert(std::all_of(output.begin(), output.end(),
+                     [&](T x) { return (x == initial); }));
 }
 
 int main() {
@@ -59,7 +61,7 @@ int main() {
   load_test<unsigned long long>(q, N);
   load_test<float>(q, N);
   load_test<double>(q, N);
-  //load_test<char*>(q, N);
+  // load_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/atomic_ref/max.cpp b/sycl/test/atomic_ref/max.cpp
index 7be6b9ac392b5..d1c326237b009 100644
--- a/sycl/test/atomic_ref/max.cpp
+++ b/sycl/test/atomic_ref/max.cpp
@@ -12,8 +12,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-void max_test(queue q, size_t N) {
+template <typename T> void max_test(queue q, size_t N) {
   T initial = std::numeric_limits<T>::lowest();
   T val = initial;
   std::vector<T> output(N);
@@ -24,10 +23,13 @@ void max_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(val[0]);
 
         // +1 accounts for lowest() returning 0 for unsigned types
         out[gid] = atm.fetch_max(T(gid) + 1);
@@ -67,7 +69,7 @@ int main() {
   max_test<unsigned long long>(q, N);
   max_test<float>(q, N);
   max_test<double>(q, N);
-  //max_test<char*>(q, N);
+  // max_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/atomic_ref/min.cpp b/sycl/test/atomic_ref/min.cpp
index 47787a52b2eea..1066370fcf4e3 100644
--- a/sycl/test/atomic_ref/min.cpp
+++ b/sycl/test/atomic_ref/min.cpp
@@ -12,8 +12,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-void min_test(queue q, size_t N) {
+template <typename T> void min_test(queue q, size_t N) {
   T initial = std::numeric_limits<T>::max();
   T val = initial;
   std::vector<T> output(N);
@@ -24,10 +23,13 @@ void min_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(val[0]);
         out[gid] = atm.fetch_min(T(gid));
       });
     });
@@ -65,7 +67,7 @@ int main() {
   min_test<unsigned long long>(q, N);
   min_test<float>(q, N);
   min_test<double>(q, N);
-  //min_test<char*>(q, N);
+  // min_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/atomic_ref/sub.cpp b/sycl/test/atomic_ref/sub.cpp
index 13ed2c5bdafbe..5296b41ddc5e5 100644
--- a/sycl/test/atomic_ref/sub.cpp
+++ b/sycl/test/atomic_ref/sub.cpp
@@ -12,8 +12,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi;
 
-template <typename T>
-void sub_fetch_test(queue q, size_t N) {
+template <typename T> void sub_fetch_test(queue q, size_t N) {
   T val = N;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -23,10 +22,13 @@ void sub_fetch_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(val[0]);
         out[gid] = atm.fetch_sub(T(1));
       });
     });
@@ -45,8 +47,7 @@ void sub_fetch_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void sub_plus_equal_test(queue q, size_t N) {
+template <typename T> void sub_plus_equal_test(queue q, size_t N) {
   T val = N;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -56,10 +57,13 @@ void sub_plus_equal_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(val[0]);
         out[gid] = atm -= T(1);
       });
     });
@@ -78,8 +82,7 @@ void sub_plus_equal_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void sub_pre_dec_test(queue q, size_t N) {
+template <typename T> void sub_pre_dec_test(queue q, size_t N) {
   T val = N;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -89,10 +92,13 @@ void sub_pre_dec_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(val[0]);
         out[gid] = --atm;
       });
     });
@@ -111,8 +117,7 @@ void sub_pre_dec_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void sub_post_dec_test(queue q, size_t N) {
+template <typename T> void sub_post_dec_test(queue q, size_t N) {
   T val = N;
   std::vector<T> output(N);
   std::fill(output.begin(), output.end(), 0);
@@ -122,10 +127,13 @@ void sub_post_dec_test(queue q, size_t N) {
 
     q.submit([&](handler &cgh) {
       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-      auto out = output_buf.template get_access<access::mode::discard_write>(cgh);
+      auto out =
+          output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device, access::address_space::global_space>(val[0]);
+        auto atm = atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device,
+                              access::address_space::global_space>(val[0]);
         out[gid] = atm--;
       });
     });
@@ -144,8 +152,7 @@ void sub_post_dec_test(queue q, size_t N) {
   assert(std::unique(output.begin(), output.end()) == output.end());
 }
 
-template <typename T>
-void sub_test(queue q, size_t N) {
+template <typename T> void sub_test(queue q, size_t N) {
   sub_fetch_test<T>(q, N);
   sub_plus_equal_test<T>(q, N);
   sub_pre_dec_test<T>(q, N);
@@ -153,13 +160,11 @@ void sub_test(queue q, size_t N) {
 }
 
 // Floating-point types do not support pre- or post-decrement
-template <>
-void sub_test<float>(queue q, size_t N) {
+template <> void sub_test<float>(queue q, size_t N) {
   sub_fetch_test<float>(q, N);
   sub_plus_equal_test<float>(q, N);
 }
-template <>
-void sub_test<double>(queue q, size_t N) {
+template <> void sub_test<double>(queue q, size_t N) {
   sub_fetch_test<double>(q, N);
   sub_plus_equal_test<double>(q, N);
 }
@@ -183,7 +188,7 @@ int main() {
   sub_test<unsigned long long>(q, N);
   sub_test<float>(q, N);
   sub_test<double>(q, N);
-  //sub_test<char*>(q, N);
+  // sub_test<char*>(q, N);
 
   std::cout << "Test passed." << std::endl;
 }
diff --git a/sycl/test/built-ins/printf.cpp b/sycl/test/built-ins/printf.cpp
index fe1a416b9550e..26a7bde2451b3 100644
--- a/sycl/test/built-ins/printf.cpp
+++ b/sycl/test/built-ins/printf.cpp
@@ -78,18 +78,15 @@ int main() {
 
         // However, you are still able to print them by-element:
         {
-          ext::oneapi::printf(format_vec, (int32_t)v4.w(),
-                                      (int32_t)v4.z(), (int32_t)v4.y(),
-                                      (int32_t)v4.x());
+          ext::oneapi::printf(format_vec, (int32_t)v4.w(), (int32_t)v4.z(),
+                              (int32_t)v4.y(), (int32_t)v4.x());
         }
 #else
         // On host side you always have to print them by-element:
-        ext::oneapi::printf(format_vec, (int32_t)v4.x(),
-                                    (int32_t)v4.y(), (int32_t)v4.z(),
-                                    (int32_t)v4.w());
-        ext::oneapi::printf(format_vec, (int32_t)v4.w(),
-                                    (int32_t)v4.z(), (int32_t)v4.y(),
-                                    (int32_t)v4.x());
+        ext::oneapi::printf(format_vec, (int32_t)v4.x(), (int32_t)v4.y(),
+                            (int32_t)v4.z(), (int32_t)v4.w());
+        ext::oneapi::printf(format_vec, (int32_t)v4.w(), (int32_t)v4.z(),
+                            (int32_t)v4.y(), (int32_t)v4.x());
 #endif // __SYCL_DEVICE_ONLY__
        // CHECK-NEXT: 5,6,7,8
        // CHECK-NEXT: 8,7,6,5
@@ -127,8 +124,8 @@ int main() {
     // CHECK-NEXT: {{[0-9]+}}: Hello, World!
   }
 
-// FIXME: strictly check output order once the bug mentioned above is fixed
-// CHECK: {{(Hello, World!)?}}
+  // FIXME: strictly check output order once the bug mentioned above is fixed
+  // CHECK: {{(Hello, World!)?}}
 
   return 0;
 }
diff --git a/sycl/test/built-ins/scalar_integer.cpp b/sycl/test/built-ins/scalar_integer.cpp
index 85a0eae294d5f..3151fb3652955 100644
--- a/sycl/test/built-ins/scalar_integer.cpp
+++ b/sycl/test/built-ins/scalar_integer.cpp
@@ -14,15 +14,14 @@ namespace s = cl::sycl;
 int main() {
   // max
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class maxSI1SI1>([=]() {
-          AccR[0] = s::max(s::cl_int{ 5 }, s::cl_int{ 2 });
-        });
+        cgh.single_task<class maxSI1SI1>(
+            [=]() { AccR[0] = s::max(s::cl_int{5}, s::cl_int{2}); });
       });
     }
     assert(r == 5);
@@ -30,15 +29,14 @@ int main() {
 
   // max
   {
-    s::cl_uint r{ 0 };
+    s::cl_uint r{0};
     {
       s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class maxUI1UI1>([=]() {
-          AccR[0] = s::max(s::cl_uint{ 5 }, s::cl_uint{ 2 });
-        });
+        cgh.single_task<class maxUI1UI1>(
+            [=]() { AccR[0] = s::max(s::cl_uint{5}, s::cl_uint{2}); });
       });
     }
     assert(r == 5);
@@ -46,15 +44,14 @@ int main() {
 
   // min
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class minSI1SI1>([=]() {
-          AccR[0] = s::min(s::cl_int{ 5 }, s::cl_int{ 2 });
-        });
+        cgh.single_task<class minSI1SI1>(
+            [=]() { AccR[0] = s::min(s::cl_int{5}, s::cl_int{2}); });
       });
     }
     assert(r == 2);
@@ -62,15 +59,14 @@ int main() {
 
   // min (longlong)
   {
-    s::longlong r{ 0 };
+    s::longlong r{0};
     {
       s::buffer<s::longlong, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class minSLL1SLL1>([=]() {
-          AccR[0] = s::min(s::longlong{ 5 }, s::longlong{ 2 });
-        });
+        cgh.single_task<class minSLL1SLL1>(
+            [=]() { AccR[0] = s::min(s::longlong{5}, s::longlong{2}); });
       });
     }
     assert(r == 2);
@@ -78,15 +74,14 @@ int main() {
 
   // min
   {
-    s::cl_uint r{ 0 };
+    s::cl_uint r{0};
     {
       s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class minUI1UI1>([=]() {
-          AccR[0] = s::min(s::cl_uint{ 5 }, s::cl_uint{ 2 });
-        });
+        cgh.single_task<class minUI1UI1>(
+            [=]() { AccR[0] = s::min(s::cl_uint{5}, s::cl_uint{2}); });
       });
     }
     assert(r == 2);
@@ -94,15 +89,14 @@ int main() {
 
   // min (ulonglong)
   {
-    s::ulonglong r{ 0 };
+    s::ulonglong r{0};
     {
       s::buffer<s::ulonglong, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class minULL1ULL1>([=]() {
-          AccR[0] = s::min(s::ulonglong{ 5 }, s::ulonglong{ 2 });
-        });
+        cgh.single_task<class minULL1ULL1>(
+            [=]() { AccR[0] = s::min(s::ulonglong{5}, s::ulonglong{2}); });
       });
     }
     assert(r == 2);
@@ -110,15 +104,14 @@ int main() {
 
   // abs
   {
-    s::cl_uint r{ 0 };
+    s::cl_uint r{0};
     {
       s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class absSI1>([=]() {
-          AccR[0] = s::abs(s::cl_int{ -5 });
-        });
+        cgh.single_task<class absSI1>(
+            [=]() { AccR[0] = s::abs(s::cl_int{-5}); });
       });
     }
     assert(r == 5);
@@ -126,15 +119,14 @@ int main() {
 
   // abs_diff
   {
-    s::cl_uint r{ 0 };
+    s::cl_uint r{0};
     {
       s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class abs_diffSI1SI1>([=]() {
-          AccR[0] = s::abs_diff(s::cl_int{ -5 }, s::cl_int{ -1 });
-        });
+        cgh.single_task<class abs_diffSI1SI1>(
+            [=]() { AccR[0] = s::abs_diff(s::cl_int{-5}, s::cl_int{-1}); });
       });
     }
     assert(r == 4);
@@ -142,15 +134,14 @@ int main() {
 
   // abs_diff(uchar)
   {
-    s::cl_uchar r{ 0 };
+    s::cl_uchar r{0};
     {
       s::buffer<s::cl_uchar, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class abs_diffUC1UC1>([=]() {
-          AccR[0] = s::abs_diff(s::uchar{ 3 }, s::uchar{ 250 });
-        });
+        cgh.single_task<class abs_diffUC1UC1>(
+            [=]() { AccR[0] = s::abs_diff(s::uchar{3}, s::uchar{250}); });
       });
     }
     assert(r == 247);
@@ -158,14 +149,14 @@ int main() {
 
   // add_sat
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class add_satSI1SI1>([=]() {
-          AccR[0] = s::add_sat(s::cl_int{ 0x7FFFFFFF }, s::cl_int{ 100 });
+          AccR[0] = s::add_sat(s::cl_int{0x7FFFFFFF}, s::cl_int{100});
         });
       });
     }
@@ -174,14 +165,14 @@ int main() {
 
   // hadd
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class haddSI1SI1>([=]() {
-          AccR[0] = s::hadd(s::cl_int{ 0x0000007F }, s::cl_int{ 0x00000020 });
+          AccR[0] = s::hadd(s::cl_int{0x0000007F}, s::cl_int{0x00000020});
         });
       });
     }
@@ -190,14 +181,14 @@ int main() {
 
   // rhadd
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class rhaddSI1SI1>([=]() {
-          AccR[0] = s::rhadd(s::cl_int{ 0x0000007F }, s::cl_int{ 0x00000020 });
+          AccR[0] = s::rhadd(s::cl_int{0x0000007F}, s::cl_int{0x00000020});
         });
       });
     }
@@ -206,14 +197,14 @@ int main() {
 
   // clamp
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class clampSI1SI1SI1>([=]() {
-          AccR[0] = s::clamp(s::cl_int{ 5 }, s::cl_int{ 10 }, s::cl_int{ 30 });
+          AccR[0] = s::clamp(s::cl_int{5}, s::cl_int{10}, s::cl_int{30});
         });
       });
     }
@@ -222,15 +213,14 @@ int main() {
 
   // clz
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class clzSI1>([=]() {
-          AccR[0] = s::clz(s::cl_int{ 0x0FFFFFFF });
-        });
+        cgh.single_task<class clzSI1>(
+            [=]() { AccR[0] = s::clz(s::cl_int{0x0FFFFFFF}); });
       });
     }
     assert(r == 4);
@@ -238,15 +228,14 @@ int main() {
 
   // ctz
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class ctzSI1>([=]() {
-          AccR[0] = s::ext::oneapi::ctz(s::cl_int{ 0x7FFFFFF0 });
-        });
+        cgh.single_task<class ctzSI1>(
+            [=]() { AccR[0] = s::ext::oneapi::ctz(s::cl_int{0x7FFFFFF0}); });
       });
     }
     assert(r == 4);
@@ -254,15 +243,15 @@ int main() {
 
   // mad_hi
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mad_hiSI1SI1SI1>([=]() {
-          AccR[0] = s::mad_hi(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 },
-                              s::cl_int{ 0x00000001 });
+          AccR[0] = s::mad_hi(s::cl_int{0x10000000}, s::cl_int{0x00000100},
+                              s::cl_int{0x00000001});
         }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000.
       });
     }
@@ -271,15 +260,15 @@ int main() {
 
   // mad_sat
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mad_satSI1SI1SI1>([=]() {
-          AccR[0] = s::mad_sat(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 },
-                               s::cl_int{ 0x00000001 });
+          AccR[0] = s::mad_sat(s::cl_int{0x10000000}, s::cl_int{0x00000100},
+                               s::cl_int{0x00000001});
         }); // 2^31 * 2^8 = 2^39 -> 0x80 00000000 -> reuslt is saturated in the
             // product.
       });
@@ -305,20 +294,19 @@ int main() {
       });
     }
     assert(r == exp); // Should return the real number of i0*i1+i2 in CPU
-                              // Only fails in vector, but passes in scalar.
-
+                      // Only fails in vector, but passes in scalar.
   }
 
   // mul_hi
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mul_hiSI1SI1>([=]() {
-          AccR[0] = s::mul_hi(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 });
+          AccR[0] = s::mul_hi(s::cl_int{0x10000000}, s::cl_int{0x00000100});
         }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000.
       });
     }
@@ -360,14 +348,14 @@ int main() {
 
   // rotate
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class rotateSI1SI1>([=]() {
-          AccR[0] = s::rotate(s::cl_int{ 0x11100000 }, s::cl_int{ 12 });
+          AccR[0] = s::rotate(s::cl_int{0x11100000}, s::cl_int{12});
         });
       });
     }
@@ -376,7 +364,7 @@ int main() {
 
   // rotate (with large rotate size)
   {
-    s::cl_char r{ 0 };
+    s::cl_char r{0};
     {
       s::buffer<s::cl_char, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
@@ -384,7 +372,7 @@ int main() {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class rotateSI1SI2>([=]() {
           AccR[0] = s::rotate(static_cast<s::cl_char>((unsigned char)0xe0),
-              s::cl_char{ 50 });
+                              s::cl_char{50});
         });
       });
     }
@@ -393,15 +381,14 @@ int main() {
   // sub_sat
   {
     auto TestSubSat = [](s::cl_int x, s::cl_int y) {
-      s::cl_int r{ 0 };
+      s::cl_int r{0};
       {
         s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
         s::queue myQueue;
         myQueue.submit([&](s::handler &cgh) {
           auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-          cgh.single_task<class sub_satSI1SI1>([=]() {
-            AccR[0] = s::sub_sat(x, y);
-          });
+          cgh.single_task<class sub_satSI1SI1>(
+              [=]() { AccR[0] = s::sub_sat(x, y); });
         });
       }
       return r;
@@ -419,14 +406,14 @@ int main() {
 
   // upsample - 1
   {
-    s::cl_ushort r{ 0 };
+    s::cl_ushort r{0};
     {
       s::buffer<s::cl_ushort, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleUC1UC1>([=]() {
-          AccR[0] = s::upsample(s::cl_uchar{ 0x10 }, s::cl_uchar{ 0x10 });
+          AccR[0] = s::upsample(s::cl_uchar{0x10}, s::cl_uchar{0x10});
         });
       });
     }
@@ -435,14 +422,14 @@ int main() {
 
   // upsample - 2
   {
-    s::cl_short r{ 0 };
+    s::cl_short r{0};
     {
       s::buffer<s::cl_short, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSC1UC1>([=]() {
-          AccR[0] = s::upsample(s::cl_char{ 0x10 }, s::cl_uchar{ 0x10 });
+          AccR[0] = s::upsample(s::cl_char{0x10}, s::cl_uchar{0x10});
         });
       });
     }
@@ -451,14 +438,14 @@ int main() {
 
   // upsample - 3
   {
-    s::cl_uint r{ 0 };
+    s::cl_uint r{0};
     {
       s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleUS1US1>([=]() {
-          AccR[0] = s::upsample(s::cl_ushort{ 0x0010 }, s::cl_ushort{ 0x0010 });
+          AccR[0] = s::upsample(s::cl_ushort{0x0010}, s::cl_ushort{0x0010});
         });
       });
     }
@@ -467,14 +454,14 @@ int main() {
 
   // upsample - 4
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSS1US1>([=]() {
-          AccR[0] = s::upsample(s::cl_short{ 0x0010 }, s::cl_ushort{ 0x0010 });
+          AccR[0] = s::upsample(s::cl_short{0x0010}, s::cl_ushort{0x0010});
         });
       });
     }
@@ -483,15 +470,14 @@ int main() {
 
   // upsample - 5
   {
-    s::cl_ulong r{ 0 };
+    s::cl_ulong r{0};
     {
       s::buffer<s::cl_ulong, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleUI1UI1>([=]() {
-          AccR[0] =
-              s::upsample(s::cl_uint{ 0x00000010 }, s::cl_uint{ 0x00000010 });
+          AccR[0] = s::upsample(s::cl_uint{0x00000010}, s::cl_uint{0x00000010});
         });
       });
     }
@@ -500,15 +486,14 @@ int main() {
 
   // upsample - 6
   {
-    s::cl_long r{ 0 };
+    s::cl_long r{0};
     {
       s::buffer<s::cl_long, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSI1UI1>([=]() {
-          AccR[0] =
-              s::upsample(s::cl_int{ 0x00000010 }, s::cl_uint{ 0x00000010 });
+          AccR[0] = s::upsample(s::cl_int{0x00000010}, s::cl_uint{0x00000010});
         });
       });
     }
@@ -517,15 +502,14 @@ int main() {
 
   // popcount
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
-        cgh.single_task<class popcountSI1>([=]() {
-          AccR[0] = s::popcount(s::cl_int{ 0x000000FF });
-        });
+        cgh.single_task<class popcountSI1>(
+            [=]() { AccR[0] = s::popcount(s::cl_int{0x000000FF}); });
       });
     }
     assert(r == 8);
@@ -533,7 +517,7 @@ int main() {
 
   // mad24
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
@@ -541,7 +525,7 @@ int main() {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mad24SI1SI1SI1>([=]() {
           AccR[0] =
-              s::mad24(s::cl_int(0xFFFFFFFF), s::cl_int{ 20 }, s::cl_int{ 20 });
+              s::mad24(s::cl_int(0xFFFFFFFF), s::cl_int{20}, s::cl_int{20});
         });
       });
     }
@@ -550,14 +534,14 @@ int main() {
 
   // mul24
   {
-    s::cl_int r{ 0 };
+    s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
       s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class mul24SI1SI1>([=]() {
-          AccR[0] = s::mul24(s::cl_int(0xFFFFFFFF), s::cl_int{ 20 });
+          AccR[0] = s::mul24(s::cl_int(0xFFFFFFFF), s::cl_int{20});
         });
       });
     }
diff --git a/sycl/test/enqueue_barrier/enqueue_barrier.cpp b/sycl/test/enqueue_barrier/enqueue_barrier.cpp
index b84660d58b467..f8bd5a9e8523c 100644
--- a/sycl/test/enqueue_barrier/enqueue_barrier.cpp
+++ b/sycl/test/enqueue_barrier/enqueue_barrier.cpp
@@ -6,7 +6,7 @@
 // UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/fpga_device_selector.hpp>
+#include <CL/sycl/ext/intel/fpga_device_selector.hpp>
 
 int main() {
   sycl::context Context;
diff --git a/sycl/test/fpga_tests/fpga_io_pipes.cpp b/sycl/test/fpga_tests/fpga_io_pipes.cpp
index 989e390389418..20f3c489bded0 100644
--- a/sycl/test/fpga_tests/fpga_io_pipes.cpp
+++ b/sycl/test/fpga_tests/fpga_io_pipes.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <CL/sycl/ext/intel/fpga_extensions.hpp>
 #include <fstream>
 #include <iostream>
 
diff --git a/sycl/test/fpga_tests/fpga_pipes.cpp b/sycl/test/fpga_tests/fpga_pipes.cpp
index 1eba903d9972e..e3496e0168878 100644
--- a/sycl/test/fpga_tests/fpga_pipes.cpp
+++ b/sycl/test/fpga_tests/fpga_pipes.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <CL/sycl/ext/intel/fpga_extensions.hpp>
 #include <iostream>
 
 // Size of an array passing through a pipe
@@ -27,11 +27,10 @@ class nb_pipe;
 }
 
 // For non-blocking template pipes
-template<int N>
-class templ_nb_pipe;
+template <int N> class templ_nb_pipe;
 
 // For non-blocking multiple pipes
-template<int N>
-using PipeMulNb = cl::sycl::intel::pipe<class templ_nb_pipe<N>, int>;
+template <int N>
+using PipeMulNb = cl::sycl::ext::intel::pipe<class templ_nb_pipe<N>, int>;
 
 // For simple blocking pipes with explicit type
@@ -43,25 +42,22 @@ class bl_pipe;
 }
 
 // For blocking template pipes
-template<int N>
-class templ_bl_pipe;
+template <int N> class templ_bl_pipe;
 
 // For blocking multiple pipes
-template<int N>
-using PipeMulBl = cl::sycl::intel::pipe<class templ_bl_pipe<N>, int>;
+template <int N>
+using PipeMulBl = cl::sycl::ext::intel::pipe<class templ_bl_pipe<N>, int>;
 
 // Kernel names
-template <int TestNumber, int KernelNumber = 0>
-class writer;
-template <int TestNumber, int KernelNumber = 0>
-class reader;
+template <int TestNumber, int KernelNumber = 0> class writer;
+template <int TestNumber, int KernelNumber = 0> class reader;
 
 // Test for simple non-blocking pipes
-template<typename PipeName, int TestNumber>
+template <typename PipeName, int TestNumber>
 int test_simple_nb_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
-  using Pipe = cl::sycl::intel::pipe<PipeName, int>;
+  using Pipe = cl::sycl::ext::intel::pipe<PipeName, int>;
 
   cl::sycl::buffer<int, 1> readBuf(data, 1);
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -97,8 +93,7 @@ int test_simple_nb_pipe(cl::sycl::queue Queue) {
 }
 
 // Test for multiple non-blocking pipes
-template<int TestNumber>
-int test_multiple_nb_pipe(cl::sycl::queue Queue) {
+template <int TestNumber> int test_multiple_nb_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -149,10 +144,9 @@ int test_multiple_nb_pipe(cl::sycl::queue Queue) {
 }
 
 // Test for array passing through a non-blocking pipe
-template<int TestNumber>
-int test_array_th_nb_pipe(cl::sycl::queue Queue) {
+template <int TestNumber> int test_array_th_nb_pipe(cl::sycl::queue Queue) {
   int data[N] = {0};
-  using AnotherNbPipe = cl::sycl::intel::pipe<class another_nb_pipe, int>;
+  using AnotherNbPipe = cl::sycl::ext::intel::pipe<class another_nb_pipe, int>;
 
   Queue.submit([&](cl::sycl::handler &cgh) {
     cgh.single_task<class writer<TestNumber>>([=]() {
@@ -190,11 +184,11 @@ int test_array_th_nb_pipe(cl::sycl::queue Queue) {
 }
 
 // Test for simple blocking pipes
-template<typename PipeName, int TestNumber>
+template <typename PipeName, int TestNumber>
 int test_simple_bl_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
-  using Pipe = cl::sycl::intel::pipe<PipeName, int>;
+  using Pipe = cl::sycl::ext::intel::pipe<PipeName, int>;
 
   cl::sycl::buffer<int, 1> readBuf(data, 1);
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -224,8 +218,7 @@ int test_simple_bl_pipe(cl::sycl::queue Queue) {
 }
 
 // Test for multiple blocking pipes
-template<int TestNumber>
-int test_multiple_bl_pipe(cl::sycl::queue Queue) {
+template <int TestNumber> int test_multiple_bl_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -261,10 +254,9 @@ int test_multiple_bl_pipe(cl::sycl::queue Queue) {
 }
 
 // Test for array passing through a blocking pipe
-template<int TestNumber>
-int test_array_th_bl_pipe(cl::sycl::queue Queue) {
+template <int TestNumber> int test_array_th_bl_pipe(cl::sycl::queue Queue) {
   int data[N] = {0};
-  using AnotherBlPipe = cl::sycl::intel::pipe<class another_bl_pipe, int>;
+  using AnotherBlPipe = cl::sycl::ext::intel::pipe<class another_bl_pipe, int>;
 
   Queue.submit([&](cl::sycl::handler &cgh) {
     cgh.single_task<class writer<TestNumber>>([=]() {
diff --git a/sycl/test/function-pointers/fp-as-kernel-arg.cpp b/sycl/test/function-pointers/fp-as-kernel-arg.cpp
index ba76fdf5dbda8..a21e3d8a7d3f1 100644
--- a/sycl/test/function-pointers/fp-as-kernel-arg.cpp
+++ b/sycl/test/function-pointers/fp-as-kernel-arg.cpp
@@ -15,8 +15,9 @@
 #include <iostream>
 #include <vector>
 
-[[intel::device_indirectly_callable]]
-extern "C" int add(int A, int B) { return A + B; }
+[[intel::device_indirectly_callable]] extern "C" int add(int A, int B) {
+  return A + B;
+}
 
 int main() {
   const int Size = 10;
@@ -31,7 +32,8 @@ int main() {
   P.build_with_kernel_type<class K>();
   cl::sycl::kernel KE = P.get_kernel<class K>();
 
-  auto FptrStorage = cl::sycl::ext::oneapi::get_device_func_ptr(&add, "add", P, D);
+  auto FptrStorage =
+      cl::sycl::ext::oneapi::get_device_func_ptr(&add, "add", P, D);
   if (!D.is_host()) {
     // FIXME: update this check with query to supported extension
     // For now, we don't have runtimes that report required OpenCL extension and
@@ -54,10 +56,10 @@ int main() {
     auto AccB = BufB.template get_access<cl::sycl::access::mode::read>(CGH);
     CGH.parallel_for<class K>(
         KE, cl::sycl::range<1>(Size), [=](cl::sycl::id<1> Index) {
-      auto Fptr =
-          cl::sycl::ext::oneapi::to_device_func_ptr<decltype(add)>(FptrStorage);
-      AccA[Index] = Fptr(AccA[Index], AccB[Index]);
-    });
+          auto Fptr = cl::sycl::ext::oneapi::to_device_func_ptr<decltype(add)>(
+              FptrStorage);
+          AccA[Index] = Fptr(AccA[Index], AccB[Index]);
+        });
   });
 
   auto HostAcc = BufA.get_access<cl::sycl::access::mode::read>();
diff --git a/sycl/test/function-pointers/pass-fp-through-buffer.cpp b/sycl/test/function-pointers/pass-fp-through-buffer.cpp
index 255bbb0212eeb..10132dcc4c1fa 100644
--- a/sycl/test/function-pointers/pass-fp-through-buffer.cpp
+++ b/sycl/test/function-pointers/pass-fp-through-buffer.cpp
@@ -33,7 +33,8 @@ int main() {
   P.build_with_kernel_type<class K>();
   cl::sycl::kernel KE = P.get_kernel<class K>();
 
-  cl::sycl::buffer<cl::sycl::ext::oneapi::device_func_ptr_holder_t> DispatchTable(2);
+  cl::sycl::buffer<cl::sycl::ext::oneapi::device_func_ptr_holder_t>
+      DispatchTable(2);
   {
     auto DTAcc =
         DispatchTable.get_access<cl::sycl::access::mode::discard_write>();
@@ -68,11 +69,11 @@ int main() {
           DispatchTable.template get_access<cl::sycl::access::mode::read>(CGH);
       CGH.parallel_for<class K>(
           KE, cl::sycl::range<1>(Size), [=](cl::sycl::id<1> Index) {
-        auto FP =
-            cl::sycl::ext::oneapi::to_device_func_ptr<int(int, int)>(AccDT[Mode]);
+            auto FP = cl::sycl::ext::oneapi::to_device_func_ptr<int(int, int)>(
+                AccDT[Mode]);
 
-        AccA[Index] = FP(AccA[Index], AccB[Index]);
-      });
+            AccA[Index] = FP(AccA[Index], AccB[Index]);
+          });
     });
 
     auto HostAcc = bufA.get_access<cl::sycl::access::mode::read>();
diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp
index 35ac1266cb11c..258dec907e161 100644
--- a/sycl/test/reduction/reduction_ctor.cpp
+++ b/sycl/test/reduction/reduction_ctor.cpp
@@ -10,7 +10,6 @@
 
 using namespace cl::sycl;
 
-
 template <typename T, typename Reduction>
 void test_reducer(Reduction &Redu, T A, T B) {
   typename Reduction::reducer_type Reducer;
@@ -35,34 +34,25 @@ void test_reducer(Reduction &Redu, T Identity, T A, T B) {
          "Wrong result of binary operation.");
 }
 
-template <typename T, int Dim, class BinaryOperation>
-class Known;
-template <typename T, int Dim, class BinaryOperation>
-class Unknown;
+template <typename T, int Dim, class BinaryOperation> class Known;
+template <typename T, int Dim, class BinaryOperation> class Unknown;
 
-template <typename T>
-struct Point {
+template <typename T> struct Point {
   Point() : X(0), Y(0) {}
   Point(T X, T Y) : X(X), Y(Y) {}
   Point(T V) : X(V), Y(V) {}
-  bool operator==(const Point &P) const {
-    return P.X == X && P.Y == Y;
-  }
+  bool operator==(const Point &P) const { return P.X == X && P.Y == Y; }
   T X;
   T Y;
 };
 
-template <typename T>
-bool operator==(const Point<T> &A, const Point<T> &B) {
+template <typename T> bool operator==(const Point<T> &A, const Point<T> &B) {
   return A.X == B.X && A.Y == B.Y;
 }
 
-template <class T>
-struct PointPlus {
+template <class T> struct PointPlus {
   using P = Point<T>;
-  P operator()(const P &A, const P &B) const {
-    return P(A.X + B.X, A.Y + B.Y);
-  }
+  P operator()(const P &A, const P &B) const { return P(A.X + B.X, A.Y + B.Y); }
 };
 
 template <typename T, int Dim, class BinaryOperation>
@@ -78,8 +68,7 @@ void testKnown(T Identity, T A, T B) {
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         ReduAcc(ReduBuf, CGH);
     auto Redu = ext::oneapi::reduction(ReduAcc, BOp);
-    assert(Redu.getIdentity() == Identity &&
-           "Failed getIdentity() check().");
+    assert(Redu.getIdentity() == Identity && "Failed getIdentity() check().");
     test_reducer(Redu, A, B);
     test_reducer(Redu, Identity, A, B);
 
@@ -100,8 +89,7 @@ void testUnknown(T Identity, T A, T B) {
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         ReduAcc(ReduBuf, CGH);
     auto Redu = ext::oneapi::reduction(ReduAcc, Identity, BOp);
-    assert(Redu.getIdentity() == Identity &&
-           "Failed getIdentity() check().");
+    assert(Redu.getIdentity() == Identity && "Failed getIdentity() check().");
     test_reducer(Redu, Identity, A, B);
 
     // Command group must have at least one task in it. Use an empty one.
@@ -124,16 +112,22 @@ int main() {
   testBoth<int, ext::oneapi::bit_or<int>>(0, 1, 8);
   testBoth<int, ext::oneapi::bit_xor<int>>(0, 7, 3);
   testBoth<int, ext::oneapi::bit_and<int>>(~0, 7, 3);
-  testBoth<int, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
-  testBoth<int, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);
+  testBoth<int, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 7,
+                                           3);
+  testBoth<int, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 7,
+                                           3);
 
   testBoth<float, ext::oneapi::plus<float>>(0, 1, 7);
   testBoth<float, std::multiplies<float>>(1, 1, 7);
-  testBoth<float, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
-  testBoth<float, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 7, 3);
-
-  testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
-  testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
+  testBoth<float, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 7,
+                                               3);
+  testBoth<float, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 7,
+                                               3);
+
+  testUnknown<Point<float>, 0, PointPlus<float>>(
+      Point<float>(0), Point<float>(1), Point<float>(7));
+  testUnknown<Point<float>, 1, PointPlus<float>>(
+      Point<float>(0), Point<float>(1), Point<float>(7));
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_ext_type.hpp b/sycl/test/reduction/reduction_nd_ext_type.hpp
index 9cdfbb8dce2fc..84e21291cc5da 100644
--- a/sycl/test/reduction/reduction_nd_ext_type.hpp
+++ b/sycl/test/reduction/reduction_nd_ext_type.hpp
@@ -8,8 +8,7 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
 
 template <typename T, int Dim, access::mode Mode, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -28,8 +27,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
   queue Q;
   Q.submit([&](handler &CGH) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
-    accessor<T, Dim, Mode, access::target::global_buffer>
-        Out(OutBuf, CGH);
+    accessor<T, Dim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
     auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
@@ -44,18 +42,18 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
   // Check correctness.
   auto Out = OutBuf.template get_access<access::mode::read>();
   T ComputedOut = *(Out.get_pointer());
-  T MaxDiff = 3 * std::numeric_limits<T>::epsilon() * std::fabs(ComputedOut + CorrectOut);
+  T MaxDiff = 3 * std::numeric_limits<T>::epsilon() *
+              std::fabs(ComputedOut + CorrectOut);
   if (std::fabs(static_cast<T>(ComputedOut - CorrectOut)) > MaxDiff) {
     std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
     std::cout << "Computed value: " << ComputedOut
-              << ", Expected value: " << CorrectOut
-              << ", MaxDiff = " << MaxDiff << "\n";
+              << ", Expected value: " << CorrectOut << ", MaxDiff = " << MaxDiff
+              << "\n";
     assert(0 && "Wrong value.");
   }
 }
 
-template <typename T>
-int runTests(const string_class &ExtensionName) {
+template <typename T> int runTests(const string_class &ExtensionName) {
   device D = default_selector().select_device();
   if (!D.is_host() && !D.has_extension(ExtensionName)) {
     std::cout << "Test skipped\n";
@@ -66,13 +64,17 @@ int runTests(const string_class &ExtensionName) {
   test<T, 1, access::mode::read_write, std::multiplies<T>>(0, 4, 4);
   test<T, 0, access::mode::discard_write, ext::oneapi::plus<T>>(0, 4, 64);
 
-  test<T, 0, access::mode::read_write, ext::oneapi::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
-  test<T, 1, access::mode::discard_write, ext::oneapi::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
+  test<T, 0, access::mode::read_write, ext::oneapi::minimum<T>>(
+      getMaximumFPValue<T>(), 7, 7);
+  test<T, 1, access::mode::discard_write, ext::oneapi::maximum<T>>(
+      getMinimumFPValue<T>(), 7, 7 * 5);
 
 #if __cplusplus >= 201402L
   test<T, 1, access::mode::read_write, ext::oneapi::plus<>>(1, 3, 3 * 5);
-  test<T, 1, access::mode::discard_write, ext::oneapi::minimum<>>(getMaximumFPValue<T>(), 3, 3);
-  test<T, 0, access::mode::discard_write, ext::oneapi::maximum<>>(getMinimumFPValue<T>(), 3, 3);
+  test<T, 1, access::mode::discard_write, ext::oneapi::minimum<>>(
+      getMaximumFPValue<T>(), 3, 3);
+  test<T, 0, access::mode::discard_write, ext::oneapi::maximum<>>(
+      getMinimumFPValue<T>(), 3, 3);
 #endif // __cplusplus >= 201402L
 
   std::cout << "Test passed\n";
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
index 356038d9f38b6..912101a25c5e0 100644
--- a/sycl/test/reduction/reduction_nd_s0_dw.cpp
+++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -16,8 +16,7 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -79,16 +78,21 @@ int main() {
   test<int, 0, ext::oneapi::bit_or<int>>(0, 8, 256);
   test<int, 0, ext::oneapi::bit_xor<int>>(0, 8, 256);
   test<int, 0, ext::oneapi::bit_and<int>>(~0, 8, 256);
-  test<int, 0, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 0, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 0, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8,
+                                          256);
+  test<int, 0, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8,
+                                          256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 0, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8,
+                                              256);
+  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8,
+                                              256);
 
   // Check with CUSTOM type.
-  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(
+      CustomVec<long long>(0), 8, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
index 88f408a1c4f8d..1bb3059f97f88 100644
--- a/sycl/test/reduction/reduction_nd_s0_rw.cpp
+++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -16,8 +16,7 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -81,16 +80,21 @@ int main() {
   test<int, 0, ext::oneapi::bit_or<int>>(0, 8, 256);
   test<int, 0, ext::oneapi::bit_xor<int>>(0, 8, 256);
   test<int, 0, ext::oneapi::bit_and<int>>(~0, 8, 256);
-  test<int, 0, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 0, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 0, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8,
+                                          256);
+  test<int, 0, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8,
+                                          256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 0, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8,
+                                              256);
+  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8,
+                                              256);
 
   // Check with CUSTOM type.
-  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(
+      CustomVec<long long>(0), 8, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
index 68b8e7cafb811..43e04aa2b1e03 100644
--- a/sycl/test/reduction/reduction_nd_s1_dw.cpp
+++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp
@@ -17,8 +17,7 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -80,16 +79,21 @@ int main() {
   test<int, 1, ext::oneapi::bit_or<int>>(0, 8, 256);
   test<int, 1, ext::oneapi::bit_xor<int>>(0, 8, 256);
   test<int, 1, ext::oneapi::bit_and<int>>(~0, 8, 256);
-  test<int, 1, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 1, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 1, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8,
+                                          256);
+  test<int, 1, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8,
+                                          256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
-  test<float, 1, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 8,
+                                              256);
+  test<float, 1, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8,
+                                              256);
 
   // Check with CUSTOM type.
-  test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+  test<CustomVec<long long>, 1, CustomVecPlus<long long>>(
+      CustomVec<long long>(0), 8, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
index 68721b3f21544..1261c21a187ef 100644
--- a/sycl/test/reduction/reduction_nd_s1_rw.cpp
+++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -17,8 +17,7 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -82,16 +81,21 @@ int main() {
   test<int, 1, ext::oneapi::bit_or<int>>(0, 8, 256);
   test<int, 1, ext::oneapi::bit_xor<int>>(0, 8, 256);
   test<int, 1, ext::oneapi::bit_and<int>>(~0, 8, 256);
-  test<int, 1, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
-  test<int, 1, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
+  test<int, 1, ext::oneapi::minimum<int>>((std::numeric_limits<int>::max)(), 8,
+                                          256);
+  test<int, 1, ext::oneapi::maximum<int>>((std::numeric_limits<int>::min)(), 8,
+                                          256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
-  test<float, 1, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
+  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 1,
+                                              16);
+  test<float, 1, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 8,
+                                              256);
 
   // Check with CUSTOM type.
-  test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+  test<CustomVec<long long>, 1, CustomVecPlus<long long>>(
+      CustomVec<long long>(0), 8, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp
index 42d19f9d8025a..bc670f9bf7fa9 100644
--- a/sycl/test/reduction/reduction_placeholder.cpp
+++ b/sycl/test/reduction/reduction_placeholder.cpp
@@ -18,8 +18,7 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -33,9 +32,9 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
   (OutBuf.template get_access<access::mode::write>())[0] = Identity;
 
-  auto Out = accessor<T, Dim, access::mode::read_write,
-                      access::target::global_buffer,
-                      access::placeholder::true_t>(OutBuf);
+  auto Out =
+      accessor<T, Dim, access::mode::read_write, access::target::global_buffer,
+               access::placeholder::true_t>(OutBuf);
   // Compute.
   queue Q;
   Q.submit([&](handler &CGH) {
@@ -72,13 +71,16 @@ int main() {
   test<int, 1, ext::oneapi::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
-  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
+  test<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 5,
+                                              5 * 7);
+  test<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 4,
+                                              128);
 
   // generic algorithm
   test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
   test<int, 1, std::multiplies<int>>(1, 8, 16);
-  test<CustomVec<short>, 0, CustomVecPlus<short>>(CustomVec<short>(0), 8, 8 * 3);
+  test<CustomVec<short>, 0, CustomVecPlus<short>>(CustomVec<short>(0), 8,
+                                                  8 * 3);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
index 6619d85366c86..febb5f9615e84 100644
--- a/sycl/test/reduction/reduction_transparent.cpp
+++ b/sycl/test/reduction/reduction_transparent.cpp
@@ -18,10 +18,8 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeIdClass;
-template <typename T, int Dim, class BinaryOperation>
-class SomeNoIdClass;
+template <typename T, int Dim, class BinaryOperation> class SomeIdClass;
+template <typename T, int Dim, class BinaryOperation> class SomeNoIdClass;
 
 // Checks reductions initialized with transparent functor and explicitly set
 // identity value.
@@ -46,7 +44,8 @@ void testId(T Identity, size_t WGSize, size_t NWItems) {
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
     CGH.parallel_for<SomeIdClass<T, Dim, BinaryOperation>>(
-        NDRange, ext::oneapi::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
+        NDRange, ext::oneapi::reduction(Out, Identity, BOp),
+        [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
   });
@@ -86,7 +85,8 @@ void testNoId(T Identity, size_t WGSize, size_t NWItems) {
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
     CGH.parallel_for<SomeNoIdClass<T, Dim, BinaryOperation>>(
-        NDRange, ext::oneapi::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
+        NDRange, ext::oneapi::reduction(Out, BOp),
+        [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
   });
diff --git a/sycl/test/reduction/reduction_usm.cpp b/sycl/test/reduction/reduction_usm.cpp
index 6915f86af876e..2c568523c579f 100644
--- a/sycl/test/reduction/reduction_usm.cpp
+++ b/sycl/test/reduction/reduction_usm.cpp
@@ -7,7 +7,8 @@
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
-// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and barrier()
+// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and
+// barrier()
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // with reductions initialized with USM var.
@@ -18,10 +19,8 @@
 
 using namespace cl::sycl;
 
-template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
-template <typename T, int Dim, class BinaryOperation>
-class Copy1;
+template <typename T, int Dim, class BinaryOperation> class SomeClass;
+template <typename T, int Dim, class BinaryOperation> class Copy1;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
@@ -113,14 +112,16 @@ int main() {
   testUSM<int, 1, ext::oneapi::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  testUSM<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
-  testUSM<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
+  testUSM<float, 1, ext::oneapi::minimum<float>>(getMaximumFPValue<float>(), 5,
+                                                 5 * 7);
+  testUSM<float, 0, ext::oneapi::maximum<float>>(getMinimumFPValue<float>(), 4,
+                                                 128);
 
   // generic algorithm
   testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
   testUSM<int, 1, std::multiplies<int>>(1, 8, 16);
-  testUSM<CustomVec<short>, 0, CustomVecPlus<short>>(
-      CustomVec<short>(0), 8, 8 * 3);
+  testUSM<CustomVec<short>, 0, CustomVecPlus<short>>(CustomVec<short>(0), 8,
+                                                     8 * 3);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/sub_group/attributes.cpp b/sycl/test/sub_group/attributes.cpp
index 28c5a99a7fa8b..d8173d2d1cf72 100644
--- a/sycl/test/sub_group/attributes.cpp
+++ b/sycl/test/sub_group/attributes.cpp
@@ -18,13 +18,13 @@
 
 #include <CL/sycl.hpp>
 
-#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)           \
-  class KernelFunctor##SIZE {                    \
-  public:                                        \
-    [[cl::intel_reqd_sub_group_size(SIZE)]] void \
-    operator()(cl::sycl::nd_item<1> Item) {      \
-      const auto GID = Item.get_global_id();     \
-    }                                            \
+#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)                                         \
+  class KernelFunctor##SIZE {                                                  \
+  public:                                                                      \
+    [[cl::intel_reqd_sub_group_size(SIZE)]] void                               \
+    operator()(cl::sycl::nd_item<1> Item) {                                    \
+      const auto GID = Item.get_global_id();                                   \
+    }                                                                          \
   };
 
 KERNEL_FUNCTOR_WITH_SIZE(1);
@@ -44,8 +44,7 @@ inline uint32_t flp2(uint32_t X) {
   return X - (X >> 1);
 }
 
-template <typename Fn>
-inline void submit(cl::sycl::queue &Q) {
+template <typename Fn> inline void submit(cl::sycl::queue &Q) {
   Q.submit([](cl::sycl::handler &cgh) {
     Fn F;
     cgh.parallel_for(cl::sycl::nd_range<1>{64, 16}, F);
diff --git a/sycl/test/sub_group/barrier.cpp b/sycl/test/sub_group/barrier.cpp
index c8306c0cc18df..fe7d1cce9432a 100644
--- a/sycl/test/sub_group/barrier.cpp
+++ b/sycl/test/sub_group/barrier.cpp
@@ -19,11 +19,9 @@
 #include <CL/sycl.hpp>
 #include <limits>
 #include <numeric>
-template <typename T>
-class sycl_subgr;
+template <typename T> class sycl_subgr;
 using namespace cl::sycl;
-template <typename T>
-void check(queue &Queue, size_t G = 240, size_t L = 60) {
+template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
   try {
     nd_range<1> NdRange(G, L);
     std::vector<T> data(G);
diff --git a/sycl/test/sub_group/helper.hpp b/sycl/test/sub_group/helper.hpp
index bc88372c870ae..9f4d29ad5e558 100644
--- a/sycl/test/sub_group/helper.hpp
+++ b/sycl/test/sub_group/helper.hpp
@@ -12,14 +12,12 @@
 
 using namespace cl::sycl;
 
-template <typename T1, int N>
-struct utils {
+template <typename T1, int N> struct utils {
   static T1 add_vec(const vec<T1, N> &v);
   static bool cmp_vec(const vec<T1, N> &v, const vec<T1, N> &r);
   static std::string stringify_vec(const vec<T1, N> &v);
 };
-template <typename T2>
-struct utils<T2, 1> {
+template <typename T2> struct utils<T2, 1> {
   static T2 add_vec(const vec<T2, 1> &v) { return v.s0(); }
   static bool cmp_vec(const vec<T2, 1> &v, const vec<T2, 1> &r) {
     return v.s0() == r.s0();
@@ -28,8 +26,7 @@ struct utils<T2, 1> {
     return std::to_string((T2)v.s0());
   }
 };
-template <typename T2>
-struct utils<T2, 2> {
+template <typename T2> struct utils<T2, 2> {
   static T2 add_vec(const vec<T2, 2> &v) { return v.s0() + v.s1(); }
   static bool cmp_vec(const vec<T2, 2> &v, const vec<T2, 2> &r) {
     return v.s0() == r.s0() && v.s1() == r.s1();
@@ -39,8 +36,7 @@ struct utils<T2, 2> {
            std::to_string((T2)v.s1()) + " )";
   }
 };
-template <typename T2>
-struct utils<T2, 4> {
+template <typename T2> struct utils<T2, 4> {
   static T2 add_vec(const vec<T2, 4> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3();
   }
@@ -54,8 +50,7 @@ struct utils<T2, 4> {
            std::to_string((T2)v.s3()) + " )";
   }
 };
-template <typename T2>
-struct utils<T2, 8> {
+template <typename T2> struct utils<T2, 8> {
   static T2 add_vec(const vec<T2, 8> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
            v.s7();
@@ -74,8 +69,7 @@ struct utils<T2, 8> {
   }
 };
 
-template <typename T2>
-struct utils<T2, 16> {
+template <typename T2> struct utils<T2, 16> {
   static T2 add_vec(const vec<T2, 16> &v) {
     return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
            v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() +
@@ -102,8 +96,7 @@ struct utils<T2, 16> {
   }
 };
 
-template <typename T>
-void exit_if_not_equal(T val, T ref, const char *name) {
+template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
   if (std::is_floating_point<T>::value) {
     if (std::fabs(val - ref) > 0.01) {
       std::cout << "Unexpected result for " << name << ": " << (double)val
@@ -120,16 +113,17 @@ void exit_if_not_equal(T val, T ref, const char *name) {
 }
 
 template <typename T>
-void exit_if_not_equal(std::complex<T> val, std::complex<T> ref, const char *name) {
-  if (std::fabs(val.real() - ref.real()) > 0.01 || std::fabs(val.imag() - ref.imag()) > 0.01) {
+void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
+                       const char *name) {
+  if (std::fabs(val.real() - ref.real()) > 0.01 ||
+      std::fabs(val.imag() - ref.imag()) > 0.01) {
     std::cout << "Unexpected result for " << name << ": " << val
               << " expected value: " << ref << std::endl;
     exit(1);
   }
 }
 
-template <typename T>
-void exit_if_not_equal(T *val, T *ref, const char *name) {
+template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
   if ((val - ref) != 0) {
     std::cout << "Unexpected result for " << name << ": " << val
               << " expected value: " << ref << std::endl;
@@ -137,8 +131,7 @@ void exit_if_not_equal(T *val, T *ref, const char *name) {
   }
 }
 
-template <>
-void exit_if_not_equal(half val, half ref, const char *name) {
+template <> void exit_if_not_equal(half val, half ref, const char *name) {
   int16_t cmp_val = reinterpret_cast<int16_t &>(val);
   int16_t cmp_ref = reinterpret_cast<int16_t &>(ref);
   if (std::abs(cmp_val - cmp_ref) > 1) {
diff --git a/sycl/test/sub_group/load_store.cpp b/sycl/test/sub_group/load_store.cpp
index 507b65a8261be..5ba28e904ada8 100644
--- a/sycl/test/sub_group/load_store.cpp
+++ b/sycl/test/sub_group/load_store.cpp
@@ -17,13 +17,11 @@
 
 #include "helper.hpp"
 #include <CL/sycl.hpp>
-template <typename T, int N>
-class sycl_subgr;
+template <typename T, int N> class sycl_subgr;
 
 using namespace cl::sycl;
 
-template <typename T, int N>
-void check(queue &Queue) {
+template <typename T, int N> void check(queue &Queue) {
   const int G = 1024, L = 128;
   try {
     nd_range<1> NdRange(G, L);
@@ -96,8 +94,7 @@ void check(queue &Queue) {
     exit(1);
   }
 }
-template <typename T>
-void check(queue &Queue) {
+template <typename T> void check(queue &Queue) {
   const int G = 128, L = 64;
   try {
     nd_range<1> NdRange(G, L);
diff --git a/sycl/test/sub_group/scan.hpp b/sycl/test/sub_group/scan.hpp
index 42c8c373044f9..408d0c8cff827 100644
--- a/sycl/test/sub_group/scan.hpp
+++ b/sycl/test/sub_group/scan.hpp
@@ -10,8 +10,7 @@
 #include <CL/sycl.hpp>
 #include <limits>
 
-template <typename T, class BinaryOperation>
-class sycl_subgr;
+template <typename T, class BinaryOperation> class sycl_subgr;
 
 using namespace cl::sycl;
 
@@ -73,8 +72,7 @@ void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
   }
 }
 
-template <typename T>
-void check(queue &Queue, size_t G = 120, size_t L = 60) {
+template <typename T> void check(queue &Queue, size_t G = 120, size_t L = 60) {
   // limit data range for half to avoid rounding issues
   if (std::is_same<T, cl::sycl::half>::value) {
     G = 64;
@@ -87,21 +85,21 @@ void check(queue &Queue, size_t G = 120, size_t L = 60) {
   check_op<T>(Queue, T(0), ext::oneapi::minimum<T>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, std::numeric_limits<T>::infinity(), ext::oneapi::minimum<T>(),
-                true, G, L);
+    check_op<T>(Queue, std::numeric_limits<T>::infinity(),
+                ext::oneapi::minimum<T>(), true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::max(), ext::oneapi::minimum<T>(), true,
-                G, L);
+    check_op<T>(Queue, std::numeric_limits<T>::max(), ext::oneapi::minimum<T>(),
+                true, G, L);
   }
 
   check_op<T>(Queue, T(G), ext::oneapi::maximum<T>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, -std::numeric_limits<T>::infinity(), ext::oneapi::maximum<T>(),
-                true, G, L);
+    check_op<T>(Queue, -std::numeric_limits<T>::infinity(),
+                ext::oneapi::maximum<T>(), true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::min(), ext::oneapi::maximum<T>(), true,
-                G, L);
+    check_op<T>(Queue, std::numeric_limits<T>::min(), ext::oneapi::maximum<T>(),
+                true, G, L);
   }
 
 #if __cplusplus >= 201402L
@@ -111,21 +109,21 @@ void check(queue &Queue, size_t G = 120, size_t L = 60) {
   check_op<T>(Queue, T(0), ext::oneapi::minimum<>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, std::numeric_limits<T>::infinity(), ext::oneapi::minimum<>(),
-                true, G, L);
+    check_op<T>(Queue, std::numeric_limits<T>::infinity(),
+                ext::oneapi::minimum<>(), true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::max(), ext::oneapi::minimum<>(), true,
-                G, L);
+    check_op<T>(Queue, std::numeric_limits<T>::max(), ext::oneapi::minimum<>(),
+                true, G, L);
   }
 
   check_op<T>(Queue, T(G), ext::oneapi::maximum<>(), false, G, L);
   if (std::is_floating_point<T>::value ||
       std::is_same<T, cl::sycl::half>::value) {
-    check_op<T>(Queue, -std::numeric_limits<T>::infinity(), ext::oneapi::maximum<>(),
-                true, G, L);
+    check_op<T>(Queue, -std::numeric_limits<T>::infinity(),
+                ext::oneapi::maximum<>(), true, G, L);
   } else {
-    check_op<T>(Queue, std::numeric_limits<T>::min(), ext::oneapi::maximum<>(), true,
-                G, L);
+    check_op<T>(Queue, std::numeric_limits<T>::min(), ext::oneapi::maximum<>(),
+                true, G, L);
   }
 #endif
 }

From a32fea12c56bd71f795df01c75c5247b147ae8a6 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Wed, 22 Jul 2020 15:55:42 -0400
Subject: [PATCH 05/13] Resolve paths after merge

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/include/CL/sycl/ext/{oneapi => intel}/fpga_lsu.hpp   | 0
 sycl/include/CL/sycl/ext/{oneapi => intel}/fpga_utils.hpp | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename sycl/include/CL/sycl/ext/{oneapi => intel}/fpga_lsu.hpp (100%)
 rename sycl/include/CL/sycl/ext/{oneapi => intel}/fpga_utils.hpp (100%)

diff --git a/sycl/include/CL/sycl/ext/oneapi/fpga_lsu.hpp b/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp
similarity index 100%
rename from sycl/include/CL/sycl/ext/oneapi/fpga_lsu.hpp
rename to sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp
diff --git a/sycl/include/CL/sycl/ext/oneapi/fpga_utils.hpp b/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp
similarity index 100%
rename from sycl/include/CL/sycl/ext/oneapi/fpga_utils.hpp
rename to sycl/include/CL/sycl/ext/intel/fpga_utils.hpp

From a671d917ac956e7fbd12d2662382968a44e39994 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Thu, 23 Jul 2020 15:56:22 -0400
Subject: [PATCH 06/13] Update FPGA tests

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/test/fpga_tests/fpga_io_pipes.cpp |  2 +-
 sycl/test/fpga_tests/fpga_lsu.cpp      | 26 +++++++++++++-------------
 sycl/test/fpga_tests/fpga_pipes.cpp    | 12 ++++++------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/sycl/test/fpga_tests/fpga_io_pipes.cpp b/sycl/test/fpga_tests/fpga_io_pipes.cpp
index 20f3c489bded0..c21704f3478f5 100644
--- a/sycl/test/fpga_tests/fpga_io_pipes.cpp
+++ b/sycl/test/fpga_tests/fpga_io_pipes.cpp
@@ -113,7 +113,7 @@ int test_io_bl_pipe(cl::sycl::queue Queue) {
 }
 
 int main() {
-  cl::sycl::queue Queue{cl::sycl::intel::fpga_emulator_selector{}};
+  cl::sycl::queue Queue{cl::sycl::ext::intel::fpga_emulator_selector{}};
 
   if (!Queue.get_device()
            .get_info<cl::sycl::info::device::kernel_kernel_pipe_support>()) {
diff --git a/sycl/test/fpga_tests/fpga_lsu.cpp b/sycl/test/fpga_tests/fpga_lsu.cpp
index 65b35e09dbd69..f63343edd9db5 100644
--- a/sycl/test/fpga_tests/fpga_lsu.cpp
+++ b/sycl/test/fpga_tests/fpga_lsu.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <CL/sycl/ext/intel/fpga_extensions.hpp>
 
 // TODO: run is disabled, since no support added in FPGA backend yet. Check
 // implementation correctness from CXX and SYCL languages perspective.
@@ -38,20 +38,20 @@ int test_lsu(cl::sycl::queue Queue) {
         auto input_ptr = input_accessor.get_pointer();
         auto output_ptr = output_accessor.get_pointer();
 
-        using PrefetchingLSU =
-            cl::sycl::intel::lsu<cl::sycl::intel::prefetch<true>,
-                                 cl::sycl::intel::statically_coalesce<false>>;
+        using PrefetchingLSU = cl::sycl::ext::intel::lsu<
+            cl::sycl::ext::intel::prefetch<true>,
+            cl::sycl::ext::intel::statically_coalesce<false>>;
 
-        using BurstCoalescedLSU =
-            cl::sycl::intel::lsu<cl::sycl::intel::burst_coalesce<true>,
-                                 cl::sycl::intel::statically_coalesce<false>>;
+        using BurstCoalescedLSU = cl::sycl::ext::intel::lsu<
+            cl::sycl::ext::intel::burst_coalesce<true>,
+            cl::sycl::ext::intel::statically_coalesce<false>>;
 
-        using CachingLSU =
-            cl::sycl::intel::lsu<cl::sycl::intel::burst_coalesce<true>,
-                                 cl::sycl::intel::cache<1024>,
-                                 cl::sycl::intel::statically_coalesce<false>>;
+        using CachingLSU = cl::sycl::ext::intel::lsu<
+            cl::sycl::ext::intel::burst_coalesce<true>,
+            cl::sycl::ext::intel::cache<1024>,
+            cl::sycl::ext::intel::statically_coalesce<false>>;
 
-        using PipelinedLSU = cl::sycl::intel::lsu<>;
+        using PipelinedLSU = cl::sycl::ext::intel::lsu<>;
 
         int X = PrefetchingLSU::load(input_ptr); // int X = input_ptr[0]
         int Y = CachingLSU::load(input_ptr + 1); // int Y = input_ptr[1]
@@ -74,7 +74,7 @@ int test_lsu(cl::sycl::queue Queue) {
 }
 
 int main() {
-  cl::sycl::queue Queue{cl::sycl::intel::fpga_emulator_selector{}};
+  cl::sycl::queue Queue{cl::sycl::ext::intel::fpga_emulator_selector{}};
 
   return test_lsu(Queue);
 }
diff --git a/sycl/test/fpga_tests/fpga_pipes.cpp b/sycl/test/fpga_tests/fpga_pipes.cpp
index e3496e0168878..6539cf75f926b 100644
--- a/sycl/test/fpga_tests/fpga_pipes.cpp
+++ b/sycl/test/fpga_tests/fpga_pipes.cpp
@@ -31,7 +31,7 @@ template <int N> class templ_nb_pipe;
 
 // For non-blocking multiple pipes
 template <int N>
-using PipeMulNb = cl::sycl::intel::pipe<class templ_nb_pipe<N>, int>;
+using PipeMulNb = cl::sycl::ext::oneapi::pipe<class templ_nb_pipe<N>, int>;
 
 // For simple blocking pipes with explicit type
 class some_bl_pipe;
@@ -46,7 +46,7 @@ template <int N> class templ_bl_pipe;
 
 // For blocking multiple pipes
 template <int N>
-using PipeMulBl = cl::sycl::ext::intel::pipe<class templ_bl_pipe<N>, int>;
+using PipeMulBl = cl::sycl::ext::oneapi::pipe<class templ_bl_pipe<N>, int>;
 
 // Kernel names
 template <int TestNumber, int KernelNumber = 0> class writer;
@@ -57,7 +57,7 @@ template <typename PipeName, int TestNumber>
 int test_simple_nb_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
-  using Pipe = cl::sycl::ext::intel::pipe<PipeName, int>;
+  using Pipe = cl::sycl::ext::oneapi::pipe<PipeName, int>;
 
   cl::sycl::buffer<int, 1> readBuf(data, 1);
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -146,7 +146,7 @@ template <int TestNumber> int test_multiple_nb_pipe(cl::sycl::queue Queue) {
 // Test for array passing through a non-blocking pipe
 template <int TestNumber> int test_array_th_nb_pipe(cl::sycl::queue Queue) {
   int data[N] = {0};
-  using AnotherNbPipe = cl::sycl::ext::intel::pipe<class another_nb_pipe, int>;
+  using AnotherNbPipe = cl::sycl::ext::oneapi::pipe<class another_nb_pipe, int>;
 
   Queue.submit([&](cl::sycl::handler &cgh) {
     cgh.single_task<class writer<TestNumber>>([=]() {
@@ -188,7 +188,7 @@ template <typename PipeName, int TestNumber>
 int test_simple_bl_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
-  using Pipe = cl::sycl::ext::intel::pipe<PipeName, int>;
+  using Pipe = cl::sycl::ext::oneapi::pipe<PipeName, int>;
 
   cl::sycl::buffer<int, 1> readBuf(data, 1);
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -256,7 +256,7 @@ template <int TestNumber> int test_multiple_bl_pipe(cl::sycl::queue Queue) {
 // Test for array passing through a blocking pipe
 template <int TestNumber> int test_array_th_bl_pipe(cl::sycl::queue Queue) {
   int data[N] = {0};
-  using AnotherBlPipe = cl::sycl::ext::intel::pipe<class another_bl_pipe, int>;
+  using AnotherBlPipe = cl::sycl::ext::oneapi::pipe<class another_bl_pipe, int>;
 
   Queue.submit([&](cl::sycl::handler &cgh) {
     cgh.single_task<class writer<TestNumber>>([=]() {

From c25fa07f9c971de39f840ccb614d8040f338b684 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Thu, 23 Jul 2020 16:16:51 -0400
Subject: [PATCH 07/13] Update header file paths

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/include/CL/sycl/ext/oneapi/atomic_accessor.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/include/CL/sycl/ext/oneapi/atomic_accessor.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic_accessor.hpp
index faa8ef745365e..40286132566d7 100644
--- a/sycl/include/CL/sycl/ext/oneapi/atomic_accessor.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic_accessor.hpp
@@ -9,8 +9,8 @@
 #pragma once
 
 #include <CL/sycl/access/access.hpp>
-#include <CL/sycl/intel/atomic_enums.hpp>
-#include <CL/sycl/intel/atomic_ref.hpp>
+#include <CL/sycl/ext/oneapi/atomic_enums.hpp>
+#include <CL/sycl/ext/oneapi/atomic_ref.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {

From 3e101b549ad64c3199adbe62d9ac6e0a6bfb1516 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Thu, 23 Jul 2020 17:19:17 -0400
Subject: [PATCH 08/13] Fix atomic_ref include and accessor test for ext::oneapi

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 .../include/CL/sycl/ext/oneapi/atomic_ref.hpp |  1 +
 sycl/test/atomic_ref/accessor.cpp             | 31 ++++++++++---------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp b/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
index 19bfbefefb3a2..91c9472b8e82c 100644
--- a/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
+++ b/sycl/include/CL/sycl/ext/oneapi/atomic_ref.hpp
@@ -13,6 +13,7 @@
 #include <CL/sycl/atomic.hpp>
 #include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/helpers.hpp>
+#include <CL/sycl/detail/spirv.hpp>
 #include <CL/sycl/ext/oneapi/atomic_enums.hpp>
 
 #ifndef __SYCL_DEVICE_ONLY__
diff --git a/sycl/test/atomic_ref/accessor.cpp b/sycl/test/atomic_ref/accessor.cpp
index 86067e2c74906..115add88c2a4a 100644
--- a/sycl/test/atomic_ref/accessor.cpp
+++ b/sycl/test/atomic_ref/accessor.cpp
@@ -10,7 +10,7 @@
 #include <numeric>
 #include <vector>
 using namespace sycl;
-using namespace sycl::intel;
+using namespace sycl::ext::oneapi;
 
 // Equivalent to add_test from add.cpp
 // Uses atomic_accessor instead of atomic_ref
@@ -24,14 +24,16 @@ template <typename T> void accessor_test(queue q, size_t N) {
     q.submit([&](handler &cgh) {
 #if __cplusplus > 201402L
       static_assert(
-          std::is_same<decltype(atomic_accessor(sum_buf, cgh, relaxed_order,
-                                                device_scope)),
-                       atomic_accessor<T, 1, intel::memory_order::relaxed,
-                                       intel::memory_scope::device>>::value,
+          std::is_same<
+              decltype(
+                  atomic_accessor(sum_buf, cgh, relaxed_order, device_scope)),
+              atomic_accessor<T, 1, ext::oneapi::memory_order::relaxed,
+                              ext::oneapi::memory_scope::device>>::value,
           "atomic_accessor type incorrectly deduced");
 #endif
-      auto sum = atomic_accessor<T, 1, intel::memory_order::relaxed,
-                                 intel::memory_scope::device>(sum_buf, cgh);
+      auto sum =
+          atomic_accessor<T, 1, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device>(
+              sum_buf, cgh);
       auto out =
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
@@ -39,8 +41,8 @@ template <typename T> void accessor_test(queue q, size_t N) {
         static_assert(
             std::is_same<
                 decltype(sum[0]),
-                atomic_ref<T, intel::memory_order::relaxed,
-                           intel::memory_scope::device,
+                atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                           ext::oneapi::memory_scope::device,
                            access::address_space::global_space>>::value,
             "atomic_accessor returns incorrect atomic_ref");
         out[gid] = sum[0].fetch_add(T(1));
@@ -69,10 +71,9 @@ void local_accessor_test(queue q, size_t N, size_t L = 8) {
   {
     buffer<T> output_buf(output.data(), output.size());
     q.submit([&](handler &cgh) {
-      auto sum =
-          atomic_accessor<T, 1, intel::memory_order::relaxed,
-                          intel::memory_scope::device, access::target::local>(
-              1, cgh);
+      auto sum = atomic_accessor<T, 1, ext::oneapi::memory_order::relaxed,
+                                 ext::oneapi::memory_scope::device,
+                                 access::target::local>(1, cgh);
       auto out = output_buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for(nd_range<1>(N, L), [=](nd_item<1> it) {
         int grp = it.get_group(0);
@@ -80,8 +81,8 @@ void local_accessor_test(queue q, size_t N, size_t L = 8) {
         it.barrier();
         static_assert(
             std::is_same<decltype(sum[0]),
-                         atomic_ref<T, intel::memory_order::relaxed,
-                                    intel::memory_scope::device,
+                         atomic_ref<T, ext::oneapi::memory_order::relaxed,
+                                    ext::oneapi::memory_scope::device,
                                     access::address_space::local_space>>::value,
             "local atomic_accessor returns incorrect atomic_ref");
         T result = sum[0].fetch_add(T(1));

From 6a6a31dddf207ee1c932b92aebd32bc0990420a9 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Fri, 24 Jul 2020 14:21:03 -0400
Subject: [PATCH 09/13] Revert pipes to ext::intel for consistency.  Fix tests.

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp   |  2 +-
 sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp          |  2 ++
 sycl/include/CL/sycl/ext/intel/fpga_utils.hpp        |  2 ++
 sycl/include/CL/sycl/ext/{oneapi => intel}/pipes.hpp |  4 ++--
 sycl/include/CL/sycl/pipes.hpp                       |  4 ++--
 sycl/test/fpga_tests/fpga_pipes.cpp                  | 12 ++++++------
 sycl/test/fpga_tests/io_pipe_def.h                   |  6 +++---
 sycl/test/regression/esimd-util-compiler-eval.cpp    |  2 +-
 8 files changed, 19 insertions(+), 15 deletions(-)
 rename sycl/include/CL/sycl/ext/{oneapi => intel}/pipes.hpp (99%)

diff --git a/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp b/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp
index 3d3dac6fae2aa..9b019db1c1d41 100644
--- a/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp
@@ -10,4 +10,4 @@
 #include <CL/sycl/ext/intel/fpga_device_selector.hpp>
 #include <CL/sycl/ext/intel/fpga_lsu.hpp>
 #include <CL/sycl/ext/intel/fpga_reg.hpp>
-#include <CL/sycl/ext/oneapi/pipes.hpp>
+#include <CL/sycl/ext/intel/pipes.hpp>
diff --git a/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp b/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp
index 5f8d37f802e76..2b8324970658a 100644
--- a/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp
@@ -13,6 +13,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 constexpr uint8_t BURST_COALESCE = 0x1;
 constexpr uint8_t CACHE = 0x2;
@@ -109,5 +110,6 @@ template <class... mem_access_params> class lsu final {
   }
 };
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp b/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp
index be9bf1a6fc5af..60324b4149c12 100644
--- a/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp
+++ b/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp
@@ -13,6 +13,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
 namespace intel {
 
 template <template <int32_t> class Type, class T>
@@ -29,5 +30,6 @@ struct GetValue<Type, T1, T...> {
                        GetValue<Type, T...>>::type::value;
 };
 } // namespace intel
+} // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/ext/oneapi/pipes.hpp b/sycl/include/CL/sycl/ext/intel/pipes.hpp
similarity index 99%
rename from sycl/include/CL/sycl/ext/oneapi/pipes.hpp
rename to sycl/include/CL/sycl/ext/intel/pipes.hpp
index cfa906654a4b8..68af9728cdb52 100644
--- a/sycl/include/CL/sycl/ext/oneapi/pipes.hpp
+++ b/sycl/include/CL/sycl/ext/intel/pipes.hpp
@@ -15,7 +15,7 @@
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace ext {
-namespace oneapi {
+namespace intel {
 
 template <class name, class dataT, int32_t min_capacity = 0> class pipe {
 public:
@@ -199,7 +199,7 @@ class kernel_writeable_io_pipe {
 #endif // __SYCL_DEVICE_ONLY__
 };
 
-} // namespace oneapi
+} // namespace intel
 } // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/pipes.hpp b/sycl/include/CL/sycl/pipes.hpp
index 647c9a2a8aee8..a5bb42bf38c8b 100644
--- a/sycl/include/CL/sycl/pipes.hpp
+++ b/sycl/include/CL/sycl/pipes.hpp
@@ -8,11 +8,11 @@
 
 #pragma once
 
-#include <CL/sycl/ext/oneapi/pipes.hpp>
+#include <CL/sycl/ext/intel/pipes.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 template <class name, class dataT, int32_t min_capacity = 0>
-using pipe = ext::oneapi::pipe<name, dataT, min_capacity>;
+using pipe = ext::intel::pipe<name, dataT, min_capacity>;
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/test/fpga_tests/fpga_pipes.cpp b/sycl/test/fpga_tests/fpga_pipes.cpp
index 6539cf75f926b..e84bfc03739f6 100644
--- a/sycl/test/fpga_tests/fpga_pipes.cpp
+++ b/sycl/test/fpga_tests/fpga_pipes.cpp
@@ -31,7 +31,7 @@ template <int N> class templ_nb_pipe;
 
 // For non-blocking multiple pipes
 template <int N>
-using PipeMulNb = cl::sycl::ext::oneapi::pipe<class templ_nb_pipe<N>, int>;
+using PipeMulNb = cl::sycl::ext::intel::pipe<class templ_nb_pipe<N>, int>;
 
 // For simple blocking pipes with explicit type
 class some_bl_pipe;
@@ -46,7 +46,7 @@ template <int N> class templ_bl_pipe;
 
 // For blocking multiple pipes
 template <int N>
-using PipeMulBl = cl::sycl::ext::oneapi::pipe<class templ_bl_pipe<N>, int>;
+using PipeMulBl = cl::sycl::ext::intel::pipe<class templ_bl_pipe<N>, int>;
 
 // Kernel names
 template <int TestNumber, int KernelNumber = 0> class writer;
@@ -57,7 +57,7 @@ template <typename PipeName, int TestNumber>
 int test_simple_nb_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
-  using Pipe = cl::sycl::ext::oneapi::pipe<PipeName, int>;
+  using Pipe = cl::sycl::ext::intel::pipe<PipeName, int>;
 
   cl::sycl::buffer<int, 1> readBuf(data, 1);
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -146,7 +146,7 @@ template <int TestNumber> int test_multiple_nb_pipe(cl::sycl::queue Queue) {
 // Test for array passing through a non-blocking pipe
 template <int TestNumber> int test_array_th_nb_pipe(cl::sycl::queue Queue) {
   int data[N] = {0};
-  using AnotherNbPipe = cl::sycl::ext::oneapi::pipe<class another_nb_pipe, int>;
+  using AnotherNbPipe = cl::sycl::ext::intel::pipe<class another_nb_pipe, int>;
 
   Queue.submit([&](cl::sycl::handler &cgh) {
     cgh.single_task<class writer<TestNumber>>([=]() {
@@ -188,7 +188,7 @@ template <typename PipeName, int TestNumber>
 int test_simple_bl_pipe(cl::sycl::queue Queue) {
   int data[] = {0};
 
-  using Pipe = cl::sycl::ext::oneapi::pipe<PipeName, int>;
+  using Pipe = cl::sycl::ext::intel::pipe<PipeName, int>;
 
   cl::sycl::buffer<int, 1> readBuf(data, 1);
   Queue.submit([&](cl::sycl::handler &cgh) {
@@ -256,7 +256,7 @@ template <int TestNumber> int test_multiple_bl_pipe(cl::sycl::queue Queue) {
 // Test for array passing through a blocking pipe
 template <int TestNumber> int test_array_th_bl_pipe(cl::sycl::queue Queue) {
   int data[N] = {0};
-  using AnotherBlPipe = cl::sycl::ext::oneapi::pipe<class another_bl_pipe, int>;
+  using AnotherBlPipe = cl::sycl::ext::intel::pipe<class another_bl_pipe, int>;
 
   Queue.submit([&](cl::sycl::handler &cgh) {
     cgh.single_task<class writer<TestNumber>>([=]() {
diff --git a/sycl/test/fpga_tests/io_pipe_def.h b/sycl/test/fpga_tests/io_pipe_def.h
index bbfa2f3a0a49d..e86c587be056b 100644
--- a/sycl/test/fpga_tests/io_pipe_def.h
+++ b/sycl/test/fpga_tests/io_pipe_def.h
@@ -1,4 +1,4 @@
-#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <CL/sycl/ext/intel/fpga_extensions.hpp>
 
 namespace intelfpga {
 template <unsigned ID> struct ethernet_pipe_id {
@@ -6,7 +6,7 @@ template <unsigned ID> struct ethernet_pipe_id {
 };
 
 using ethernet_read_pipe =
-    sycl::intel::kernel_readable_io_pipe<ethernet_pipe_id<0>, int, 0>;
+    sycl::ext::intel::kernel_readable_io_pipe<ethernet_pipe_id<0>, int, 0>;
 using ethernet_write_pipe =
-    sycl::intel::kernel_writeable_io_pipe<ethernet_pipe_id<1>, int, 0>;
+    sycl::ext::intel::kernel_writeable_io_pipe<ethernet_pipe_id<1>, int, 0>;
 } // namespace intelfpga
diff --git a/sycl/test/regression/esimd-util-compiler-eval.cpp b/sycl/test/regression/esimd-util-compiler-eval.cpp
index 4d494f1b5e78e..f1dac23f9e675 100644
--- a/sycl/test/regression/esimd-util-compiler-eval.cpp
+++ b/sycl/test/regression/esimd-util-compiler-eval.cpp
@@ -2,7 +2,7 @@
 // This test checks compile-time evaluation of functions from esimd_util.hpp
 
 #include "CL/sycl.hpp"
-#include "CL/sycl/intel/esimd/esimd.hpp"
+#include "CL/sycl/ext/intel/esimd/esimd.hpp"
 
 static_assert(__esimd::getNextPowerOf2<0>() == 0, "");
 static_assert(__esimd::getNextPowerOf2<1>() == 1, "");

From 96dbe71bd5e685df96cf69a1dce19fdf5d8e827c Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Fri, 24 Jul 2020 14:29:13 -0400
Subject: [PATCH 10/13] Update clang to handle new spec constant namespace.
 clang-format test fix.

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 clang/lib/Sema/SemaSYCL.cpp       | 5 +++--
 sycl/test/atomic_ref/accessor.cpp | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 8cbd33b279e8f..e89cead64ca30 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -2812,10 +2812,11 @@ bool Util::isSyclHalfType(const QualType &Ty) {
 
 bool Util::isSyclSpecConstantType(const QualType &Ty) {
   const StringRef &Name = "spec_constant";
-  std::array<DeclContextDesc, 4> Scopes = {
+  std::array<DeclContextDesc, 5> Scopes = {
       Util::DeclContextDesc{clang::Decl::Kind::Namespace, "cl"},
       Util::DeclContextDesc{clang::Decl::Kind::Namespace, "sycl"},
-      Util::DeclContextDesc{clang::Decl::Kind::Namespace, "experimental"},
+      Util::DeclContextDesc{clang::Decl::Kind::Namespace, "ext"},
+      Util::DeclContextDesc{clang::Decl::Kind::Namespace, "oneapi"},
       Util::DeclContextDesc{Decl::Kind::ClassTemplateSpecialization, Name}};
   return matchQualifiedTypeName(Ty, Scopes);
 }
diff --git a/sycl/test/atomic_ref/accessor.cpp b/sycl/test/atomic_ref/accessor.cpp
index 115add88c2a4a..c511c07a9c9b3 100644
--- a/sycl/test/atomic_ref/accessor.cpp
+++ b/sycl/test/atomic_ref/accessor.cpp
@@ -32,8 +32,8 @@ template <typename T> void accessor_test(queue q, size_t N) {
           "atomic_accessor type incorrectly deduced");
 #endif
       auto sum =
-          atomic_accessor<T, 1, ext::oneapi::memory_order::relaxed, ext::oneapi::memory_scope::device>(
-              sum_buf, cgh);
+          atomic_accessor<T, 1, ext::oneapi::memory_order::relaxed,
+                          ext::oneapi::memory_scope::device>(sum_buf, cgh);
       auto out =
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {

From c70eed5a88cacab60296401b0e8e7dd275f408be Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Fri, 24 Jul 2020 16:04:07 -0400
Subject: [PATCH 11/13] Update spec constant clang test to new namespace

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 .../CodeGenSYCL/int_header_spec_const.cpp     | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/clang/test/CodeGenSYCL/int_header_spec_const.cpp b/clang/test/CodeGenSYCL/int_header_spec_const.cpp
index e6743c4ea2e91..b43619aeb8e74 100644
--- a/clang/test/CodeGenSYCL/int_header_spec_const.cpp
+++ b/clang/test/CodeGenSYCL/int_header_spec_const.cpp
@@ -20,18 +20,18 @@ class MyDoubleConst;
 
 int main() {
   // Create specialization constants.
-  cl::sycl::experimental::spec_constant<bool, MyBoolConst> i1(false);
-  cl::sycl::experimental::spec_constant<char, MyInt8Const> i8(0);
-  cl::sycl::experimental::spec_constant<unsigned char, MyUInt8Const> ui8(0);
-  cl::sycl::experimental::spec_constant<short, MyInt16Const> i16(0);
-  cl::sycl::experimental::spec_constant<unsigned short, MyUInt16Const> ui16(0);
-  cl::sycl::experimental::spec_constant<int, MyInt32Const> i32(0);
+  cl::sycl::ext::intel::spec_constant<bool, MyBoolConst> i1(false);
+  cl::sycl::ext::intel::spec_constant<char, MyInt8Const> i8(0);
+  cl::sycl::ext::intel::spec_constant<unsigned char, MyUInt8Const> ui8(0);
+  cl::sycl::ext::intel::spec_constant<short, MyInt16Const> i16(0);
+  cl::sycl::ext::intel::spec_constant<unsigned short, MyUInt16Const> ui16(0);
+  cl::sycl::ext::intel::spec_constant<int, MyInt32Const> i32(0);
   // Constant used twice, but there must be single entry in the int header,
   // otherwise compilation error would be issued.
-  cl::sycl::experimental::spec_constant<int, MyInt32Const> i32_1(0);
-  cl::sycl::experimental::spec_constant<unsigned int, MyUInt32Const> ui32(0);
-  cl::sycl::experimental::spec_constant<float, MyFloatConst> f32(0);
-  cl::sycl::experimental::spec_constant<double, MyDoubleConst> f64(0);
+  cl::sycl::ext::intel::spec_constant<int, MyInt32Const> i32_1(0);
+  cl::sycl::ext::intel::spec_constant<unsigned int, MyUInt32Const> ui32(0);
+  cl::sycl::ext::intel::spec_constant<float, MyFloatConst> f32(0);
+  cl::sycl::ext::intel::spec_constant<double, MyDoubleConst> f64(0);
 
   double val;
   double *ptr = &val; // to avoid "unused" warnings

From b6a3a664b44cd3de697b1c8c7f686786faf23526 Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Fri, 24 Jul 2020 17:11:54 -0400
Subject: [PATCH 12/13] Fix clang spec constant test.

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 clang/test/CodeGenSYCL/Inputs/sycl.hpp        |  6 ++++--
 .../CodeGenSYCL/int_header_spec_const.cpp     | 20 +++++++++----------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp
index 3184c58edcbfc..82bd00d6b32a3 100644
--- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp
+++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp
@@ -242,7 +242,8 @@ struct get_kernel_name_t<auto_name, Type> {
   using name = Type;
 };
 
-namespace experimental {
+namespace ext {
+namespace oneapi {
 template <typename T, typename ID = T>
 class spec_constant {
 public:
@@ -256,7 +257,8 @@ class spec_constant {
     return get();
   }
 };
-} // namespace experimental
+} // namespace oneapi
+} // namespace ext
 
 #define ATTR_SYCL_KERNEL __attribute__((sycl_kernel))
 template <typename KernelName = auto_name, typename KernelType>
diff --git a/clang/test/CodeGenSYCL/int_header_spec_const.cpp b/clang/test/CodeGenSYCL/int_header_spec_const.cpp
index b43619aeb8e74..31d26dfc67e4b 100644
--- a/clang/test/CodeGenSYCL/int_header_spec_const.cpp
+++ b/clang/test/CodeGenSYCL/int_header_spec_const.cpp
@@ -20,18 +20,18 @@ class MyDoubleConst;
 
 int main() {
   // Create specialization constants.
-  cl::sycl::ext::intel::spec_constant<bool, MyBoolConst> i1(false);
-  cl::sycl::ext::intel::spec_constant<char, MyInt8Const> i8(0);
-  cl::sycl::ext::intel::spec_constant<unsigned char, MyUInt8Const> ui8(0);
-  cl::sycl::ext::intel::spec_constant<short, MyInt16Const> i16(0);
-  cl::sycl::ext::intel::spec_constant<unsigned short, MyUInt16Const> ui16(0);
-  cl::sycl::ext::intel::spec_constant<int, MyInt32Const> i32(0);
+  cl::sycl::ext::oneapi::spec_constant<bool, MyBoolConst> i1(false);
+  cl::sycl::ext::oneapi::spec_constant<char, MyInt8Const> i8(0);
+  cl::sycl::ext::oneapi::spec_constant<unsigned char, MyUInt8Const> ui8(0);
+  cl::sycl::ext::oneapi::spec_constant<short, MyInt16Const> i16(0);
+  cl::sycl::ext::oneapi::spec_constant<unsigned short, MyUInt16Const> ui16(0);
+  cl::sycl::ext::oneapi::spec_constant<int, MyInt32Const> i32(0);
   // Constant used twice, but there must be single entry in the int header,
   // otherwise compilation error would be issued.
-  cl::sycl::ext::intel::spec_constant<int, MyInt32Const> i32_1(0);
-  cl::sycl::ext::intel::spec_constant<unsigned int, MyUInt32Const> ui32(0);
-  cl::sycl::ext::intel::spec_constant<float, MyFloatConst> f32(0);
-  cl::sycl::ext::intel::spec_constant<double, MyDoubleConst> f64(0);
+  cl::sycl::ext::oneapi::spec_constant<int, MyInt32Const> i32_1(0);
+  cl::sycl::ext::oneapi::spec_constant<unsigned int, MyUInt32Const> ui32(0);
+  cl::sycl::ext::oneapi::spec_constant<float, MyFloatConst> f32(0);
+  cl::sycl::ext::oneapi::spec_constant<double, MyDoubleConst> f64(0);
 
   double val;
   double *ptr = &val; // to avoid "unused" warnings

From 5c111f42c551dfe5807a4938fc05cb80b7f125cc Mon Sep 17 00:00:00 2001
From: James Brodman <james.brodman@intel.com>
Date: Mon, 27 Jul 2020 13:44:15 -0400
Subject: [PATCH 13/13] Update AOT spec_constant and FPGA selector tests.

Signed-off-by: James Brodman <james.brodman@intel.com>
---
 sycl/test/aot/spec_const_aot.cpp                     | 2 +-
 sycl/test/fpga_tests/global_fpga_device_selector.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/test/aot/spec_const_aot.cpp b/sycl/test/aot/spec_const_aot.cpp
index 99b451fe6d7ca..012709d10fd7f 100644
--- a/sycl/test/aot/spec_const_aot.cpp
+++ b/sycl/test/aot/spec_const_aot.cpp
@@ -35,7 +35,7 @@ int main(int argc, char **argv) {
   std::cout << "Running on " << q.get_device().get_info<info::device::name>() << "\n";
   cl::sycl::program prog(q.get_context());
 
-  cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+  cl::sycl::ext::oneapi::spec_constant<int32_t, MyInt32Const> i32 =
       prog.set_spec_constant<MyInt32Const>(10);
 
   prog.build_with_kernel_type<Kernel>();
diff --git a/sycl/test/fpga_tests/global_fpga_device_selector.cpp b/sycl/test/fpga_tests/global_fpga_device_selector.cpp
index edd2007cd9bf1..865a5ceb9cdc2 100644
--- a/sycl/test/fpga_tests/global_fpga_device_selector.cpp
+++ b/sycl/test/fpga_tests/global_fpga_device_selector.cpp
@@ -4,13 +4,13 @@
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 #include <CL/sycl.hpp>
-#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <CL/sycl/ext/intel/fpga_extensions.hpp>
 
 // Check that FPGA emulator device is found if we try to initialize inline global
 // variable using fpga_emulator_selector parameter.
 
 inline cl::sycl::queue fpga_emu_queue_inlined{
-    cl::sycl::intel::fpga_emulator_selector{}};
+    cl::sycl::ext::intel::fpga_emulator_selector{}};
 
 int main() {
   return 0;