From 769d3b90a4525543ab748904cf640cf81117e99e Mon Sep 17 00:00:00 2001
From: Ilya Stepykin
Date: Wed, 8 Jul 2020 13:52:17 +0300
Subject: [PATCH 1/5] [SYCL] USM shared memory allocator for L0 plugin

In L0, each allocation takes up at least a memory page, no matter how small
the requested size is. This adds significant overhead when an app does many
small allocations. This patch adds a memory allocator on top of the L0 USM
API in order to solve the problem.

High-level description: there is a predefined list of bucket sizes. When the
allocation function is called, the best-fitting bucket is found for the
requested size. If there is a free chunk in the bucket, it is returned;
otherwise a new slab of size 64k is requested from the system via the L0 API.
This slab is split into chunks of the corresponding bucket's size, and one of
its free chunks is returned. A returned chunk is marked as non-free until the
user calls free with the corresponding pointer.

In addition, add an environment variable SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR
which disables the allocator and restores the original behavior.

Signed-off-by: Ilya Stepykin
---
 sycl/doc/EnvironmentVariables.md | 1 +
 sycl/plugins/level_zero/CMakeLists.txt | 2 +
 sycl/plugins/level_zero/pi_level_zero.cpp | 195 ++++++++-
 sycl/plugins/level_zero/pi_level_zero.hpp | 62 ++-
 sycl/plugins/level_zero/usm_allocator.cpp | 505 ++++++++++++++++++++++
 sycl/plugins/level_zero/usm_allocator.hpp | 39 ++
 sycl/test/usm/dmem_varied.cpp | 115 +++++
 sycl/test/usm/smem_concurrent.cpp | 123 ++++++
 sycl/test/usm/smem_varied.cpp | 108 +++++
 9 files changed, 1137 insertions(+), 13 deletions(-)
 create mode 100644 sycl/plugins/level_zero/usm_allocator.cpp
 create mode 100644 sycl/plugins/level_zero/usm_allocator.hpp
 create mode 100644 sycl/test/usm/dmem_varied.cpp
 create mode 100644 sycl/test/usm/smem_concurrent.cpp
 create mode 100644 sycl/test/usm/smem_varied.cpp

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 3db6cd7e7e100..eb8c2f13fb4e9 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -26,6 +26,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. 
| +| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in L0 plugin(each memory request will go directly to L0 runtine) | `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index fa6196c501606..3e8483180b503 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -75,6 +75,8 @@ add_library(pi_level_zero SHARED "${sycl_inc_dir}/CL/sycl/detail/pi.h" "${CMAKE_CURRENT_SOURCE_DIR}/pi_level_zero.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pi_level_zero.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/usm_allocator.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/usm_allocator.hpp" ) if (MSVC) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 4ed30fdbc99f5..6ac436a6497c1 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -22,6 +22,8 @@ #include +#include "usm_allocator.hpp" + namespace { // Controls Level Zero calls serialization to w/a Level Zero driver being not MT @@ -1491,10 +1493,16 @@ pi_result piContextRelease(pi_context Context) { assert(Context); if (--(Context->RefCount) == 0) { + auto ZeContext = Context->ZeContext; // Destroy the command list used for initializations ZE_CALL(zeCommandListDestroy(Context->ZeCommandListInit)); - ZE_CALL(zeContextDestroy(Context->ZeContext)); delete Context; + + // Destruction of some members of pi_context uses L0 context + // and therefore it must be valid at that point. + // Technically it should be placed to the destructor of pi_context + // but this makes API error handling more complex. + ZE_CALL(zeContextDestroy(ZeContext)); } return PI_SUCCESS; } @@ -4052,7 +4060,6 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - assert(Context); // Check that incorrect bits are not set in the properties. assert(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS))); @@ -4066,11 +4073,17 @@ pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, return PI_SUCCESS; } -pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, - pi_device Device, - pi_usm_mem_properties *Properties, size_t Size, - pi_uint32 Alignment) { +static bool ShouldUseUSMAllocator() { + // Enable allocator by default if it's not explicitly disabled + return std::getenv("SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR") == nullptr; +} + +static const bool UseUSMAllocator = ShouldUseUSMAllocator(); +pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { assert(Context); assert(Device); // Check that incorrect bits are not set in the properties. @@ -4086,11 +4099,10 @@ pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, return PI_SUCCESS; } -pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, - pi_device Device, - pi_usm_mem_properties *Properties, size_t Size, - pi_uint32 Alignment) { - +pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { assert(Context); assert(Device); // Check that incorrect bits are not set in the properties. 
@@ -4108,11 +4120,170 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, return PI_SUCCESS; } -pi_result piextUSMFree(pi_context Context, void *Ptr) { +pi_result USMFreeImpl(pi_context Context, void *Ptr) { ZE_CALL(zeMemFree(Context->ZeContext, Ptr)); return PI_SUCCESS; } +// Exception type to pass allocation errors +class UsmAllocationException { + const pi_result Error; + +public: + UsmAllocationException(pi_result Err) : Error{Err} {} + pi_result getError() const { return Error; } +}; + +pi_result USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + pi_uint32 Alignment) { + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + pi_uint32 Alignment) { + return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +void *USMMemoryAllocBase::allocate(size_t Size) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); + if (Res != PI_SUCCESS) { + throw UsmAllocationException(Res); + } + + return Ptr; +} + +void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, Alignment); + if (Res != PI_SUCCESS) { + throw UsmAllocationException(Res); + } + return Ptr; +} + +void USMMemoryAllocBase::deallocate(void *Ptr) { + auto Res = USMFreeImpl(Context, Ptr); + if (Res != PI_SUCCESS) { + throw UsmAllocationException(Res); + } +} + +pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Alignment & (Alignment - 1)) != 0)) { + return USMDeviceAllocImpl(ResultPtr, Context, Device, Properties, Size, + Alignment); + } + + try { + auto It = Context->DeviceMemAllocContexts.find(Device); + if (It == Context->DeviceMemAllocContexts.end()) + return PI_INVALID_VALUE; + + *ResultPtr = It->second.allocate(Size, Alignment); + } catch (const UsmAllocationException &Ex) { + *ResultPtr = nullptr; + return Ex.getError(); + } + + return PI_SUCCESS; +} + +pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. 
+ ((Alignment & (Alignment - 1)) != 0)) { + return USMSharedAllocImpl(ResultPtr, Context, Device, Properties, Size, + Alignment); + } + + try { + auto It = Context->SharedMemAllocContexts.find(Device); + if (It == Context->SharedMemAllocContexts.end()) + return PI_INVALID_VALUE; + + *ResultPtr = It->second.allocate(Size, Alignment); + } catch (const UsmAllocationException &Ex) { + *ResultPtr = nullptr; + return Ex.getError(); + } + + return PI_SUCCESS; +} + +pi_result piextUSMFree(pi_context Context, void *Ptr) { + if (!UseUSMAllocator) { + return USMFreeImpl(Context, Ptr); + } + + // Query the device of the allocation to determine the right allocator context + ze_device_handle_t ZeDeviceHandle; + ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {}; + + // Query memory type of the pointer we're freeing to determine the correct + // way to do it(directly or via the allocator) + ZE_CALL(zeMemGetAllocProperties( + Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, &ZeDeviceHandle)); + + // TODO: when support for multiple devices is implemented, here + // we should do the following: + // - Find pi_device instance corresponding to ZeDeviceHandle we've just got if + // exist + // - Use that pi_device to find the right allocator context and free the + // pointer. + + // The allocation doesn't belong to any device for which USM allocator is + // enabled. + if (Context->Device->ZeDevice != ZeDeviceHandle) { + return USMFreeImpl(Context, Ptr); + } + + auto DeallocationHelper = + [Context, + Ptr](std::unordered_map &AllocContextMap) { + try { + auto It = AllocContextMap.find(Context->Device); + if (It == AllocContextMap.end()) + return PI_INVALID_VALUE; + + // The right context is found, deallocate the pointer + It->second.deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + + return PI_SUCCESS; + }; + + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_SHARED: + return DeallocationHelper(Context->SharedMemAllocContexts); + case ZE_MEMORY_TYPE_DEVICE: + return DeallocationHelper(Context->DeviceMemAllocContexts); + default: + // Handled below + break; + } + return USMFreeImpl(Context, Ptr); +} + pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, const void *ArgValue) { diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 889238d15d0d9..682c9f3195a23 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -32,6 +32,8 @@ #include +#include "usm_allocator.hpp" + template To pi_cast(From Value) { // TODO: see if more sanity checks are possible. assert(sizeof(From) == sizeof(To)); @@ -89,6 +91,46 @@ struct _pi_platform { std::atomic ZeGlobalCommandListCount{0}; }; +// Implements memory allocation via L0 RT for USM allocator interface. 
+class USMMemoryAllocBase : public SystemMemory { +protected: + pi_context Context; + pi_device Device; + // Internal allocation routine which must be implemented for each allocation + // type + virtual pi_result allocateImpl(void **ResultPtr, size_t Size, + pi_uint32 Alignment) = 0; + +public: + USMMemoryAllocBase(pi_context Ctx, pi_device Dev) + : Context{Ctx}, Device{Dev} {} + void *allocate(size_t Size) override final; + void *allocate(size_t Size, size_t Alignment) override final; + void deallocate(void *Ptr) override final; +}; + +// Allocation routines for shared memory type +class USMSharedMemoryAlloc : public USMMemoryAllocBase { +protected: + pi_result allocateImpl(void **ResultPtr, size_t Size, + pi_uint32 Alignment) override; + +public: + USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for device memory type +class USMDeviceMemoryAlloc : public USMMemoryAllocBase { +protected: + pi_result allocateImpl(void **ResultPtr, size_t Size, + pi_uint32 Alignment) override; + +public: + USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + struct _pi_device : _pi_object { _pi_device(ze_device_handle_t Device, pi_platform Plt, bool isSubDevice = false) @@ -145,7 +187,19 @@ struct _pi_device : _pi_object { struct _pi_context : _pi_object { _pi_context(pi_device Device) : Device{Device}, ZeCommandListInit{nullptr}, ZeEventPool{nullptr}, - NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {} + NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} { + // TODO: when support for multiple devices is added, here we should + // loop over all the devices and initialize allocator context for each + // pair (context, device) + SharedMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device), + std::make_tuple(std::unique_ptr( + new USMSharedMemoryAlloc(this, Device)))); + DeviceMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device), + std::make_tuple(std::unique_ptr( + new USMDeviceMemoryAlloc(this, Device)))); + } // A L0 context handle is primarily used during creation and management of // resources that may be used by multiple devices. @@ -174,6 +228,12 @@ struct _pi_context : _pi_object { // and destroy the pool if there are no alive events. ze_result_t decrementAliveEventsInPool(ze_event_pool_handle_t pool); + // Store USM allocator context(internal allocator structures) + // for USM shared/host and device allocations. There is 1 allocator context + // per each pair of (context, device) per each memory type. + std::unordered_map SharedMemAllocContexts; + std::unordered_map DeviceMemAllocContexts; + private: // Following member variables are used to manage assignment of events // to event pools. diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp new file mode 100644 index 0000000000000..f10e45c8558d5 --- /dev/null +++ b/sycl/plugins/level_zero/usm_allocator.cpp @@ -0,0 +1,505 @@ +//===---------- usm_allocator.cpp - Allocator for USM memory --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usm_allocator.hpp" + +namespace settings { +// Size of the slab which is going to be requested from the system. +static constexpr size_t SlabSize = 64 * 1024; // 64K +// The largest size which is allocated via the allocator. +// Allocations with size > CutOff bypass the USM allocator and +// go directly to the runtime. +static constexpr size_t CutOff = SlabSize / 2; + +// Unfortunately we cannot deduce the size of the array, so every change +// to the number of buckets should be reflected here. +using BucketsArrayType = std::array; + +// Generates a list of bucket sizes used by the allocator. +static constexpr BucketsArrayType generateBucketSizes() { + +// In order to make bucket sizes constexpr simply write +// them all. There are some restrictions that doesn't +// allow to write this in a nicer way. + +// Simple helper to compute power of 2 +#define P(n) (1 << n) + + BucketsArrayType Sizes = {32, 48, + 64, 96, + 128, 192, + P(8), P(8) + P(7), + P(9), P(9) + P(8), + P(10), P(10) + P(9), + P(11), P(11) + P(10), + P(12), P(12) + P(11), + P(13), P(13) + P(12), + P(14), P(14) + P(13), + CutOff}; +#undef P + + return Sizes; +} + +static constexpr BucketsArrayType BucketSizes = generateBucketSizes(); + +// The implementation expects that SlabSize is 2^n +static_assert((SlabSize & (SlabSize - 1)) == 0, + "SlabSize must be a power of 2"); +} // namespace settings + +// Aligns the pointer down to the specified alignment +// (e.g. returns 8 for Size = 13, Alignment = 8) +static void *AlignPtrDown(void *Ptr, const size_t Alignment) { + return reinterpret_cast((reinterpret_cast(Ptr)) & + (~(Alignment - 1))); +} + +// Aligns the pointer up to the specified alignment +// (e.g. returns 16 for Size = 13, Alignment = 8) +static void *AlignPtrUp(void *Ptr, const size_t Alignment) { + return static_cast(AlignPtrDown(Ptr, Alignment)) + Alignment; +} + +// Aligns the value up to the specified alignment +// (e.g. returns 16 for Size = 13, Alignment = 8) +static size_t AlignUp(size_t Val, size_t Alignment) { + assert(Alignment > 0); + return (Val + Alignment - 1) & (~(Alignment - 1)); +} + +class Bucket; + +// Represents the allocated memory block of size 'settings::SlabSize' +// Internally, it splits the memory block into chunks. The number of +// chunks depends of the size of a Bucket which created the Slab. +// The chunks +// Note: Bucket's method are responsible for thread safety of Slab access, +// so no locking happens here. +class Slab { + + // Pointer to the allocated memory of SlabSize bytes + void *MemPtr; + + // Represents the current state of each chunk: + // if the bit is set then the chunk is allocated + // the chunk is free for allocation otherwise + std::vector Chunks; + + // Total number of allocated chunks at the moment. 
+ size_t NumAllocated; + + // The bucket which the slab belongs to + Bucket &bucket; + + using ListIter = std::list>::iterator; + + // Store iterator to the corresponding node in avail/unavail list + // to achieve O(1) removal + ListIter SlabListIter; + + // Hints where to start search for free chunk in a slab + size_t FirstFreeChunkIdx; + + // Return the index of the first available chunk, -1 otherwize + size_t FindFirstAvailableChunkIdx() const; + + // Register/Unregister the slab in the global slab address map. + static void regSlab(Slab &); + static void unregSlab(Slab &); + static void regSlabByAddr(void *, Slab &); + static void unregSlabByAddr(void *, Slab &); + +public: + Slab(Bucket &); + ~Slab(); + + void setIterator(ListIter It) { SlabListIter = It; } + ListIter getIterator() const { return SlabListIter; } + + size_t getNumAllocated() const { return NumAllocated; } + + void *getFreeChunk(); + + void *getPtr() const { return MemPtr; } + void *getEnd() const { + return static_cast(getPtr()) + settings::SlabSize; + } + + size_t getChunkSize() const; + size_t getNumChunks() const { return Chunks.size(); } + + bool hasAvail(); + + Bucket &getBucket(); + + void freeChunk(void *Ptr); +}; + +class Bucket { + const size_t Size; + + // List of slabs which have at least 1 available chunk. + std::list> AvailableSlabs; + + // List of slabs with 0 available chunk. + std::list> UnavailableSlabs; + + // Protects the bucket and all the corresponding slabs + std::mutex BucketLock; + + // Reference to the allocator context, used access memory allocation + // routines, slab map and etc. + USMAllocContext::USMAllocImpl &OwnAllocCtx; + +public: + Bucket(size_t Sz, USMAllocContext::USMAllocImpl &AllocCtx) + : Size{Sz}, OwnAllocCtx{AllocCtx} {} + + void *getChunk(); + + size_t getSize() const { return Size; } + + void freeChunk(void *Ptr, Slab &Slab); + SystemMemory &getMemHandle(); + USMAllocContext::USMAllocImpl &getUsmAllocCtx() { return OwnAllocCtx; } + +private: + void onFreeChunk(Slab &); + decltype(AvailableSlabs.begin()) getAvailSlab(); +}; + +class USMAllocContext::USMAllocImpl { + // It's important for the map to be destroyed last after buckets and their + // slabs This is because slab's destructor removes the object from the map. 
+ std::unordered_multimap KnownSlabs; + std::shared_timed_mutex KnownSlabsMapLock; + + // Handle to the memory allocation routine + std::unique_ptr MemHandle; + + // Store as unique_ptrs since Bucket is not Movable(because of std::mutex) + std::vector> Buckets; + +public: + USMAllocImpl(std::unique_ptr SystemMemHandle) + : MemHandle{std::move(SystemMemHandle)} { + + Buckets.reserve(settings::BucketSizes.size()); + + for (auto &&Size : settings::BucketSizes) { + Buckets.emplace_back(std::make_unique(Size, *this)); + } + } + + void *allocate(size_t Size, size_t Alignment); + void *allocate(size_t Size); + void deallocate(void *Ptr); + + SystemMemory &getMemHandle() { return *MemHandle; } + + std::shared_timed_mutex &getKnownSlabsMapLock() { return KnownSlabsMapLock; } + std::unordered_multimap &getKnownSlabs() { + return KnownSlabs; + } + +private: + Bucket &findBucket(size_t Size); +}; + +bool operator==(const Slab &Lhs, const Slab &Rhs) { + return Lhs.getPtr() == Rhs.getPtr(); +} + +std::ostream &operator<<(std::ostream &Os, const Slab &Slab) { + Os << "Slab<" << Slab.getPtr() << ", " << Slab.getEnd() << ">"; + return Os; +} + +Slab::Slab(Bucket &Bkt) + : MemPtr(Bkt.getMemHandle().allocate(settings::SlabSize)), + // In case if bucket size is not that SlabSize % b.getSize() == 0, we + // would have some padding at the end of the slab. + Chunks(settings::SlabSize / Bkt.getSize()), NumAllocated{}, + bucket(Bkt), SlabListIter{}, FirstFreeChunkIdx{0} { + + regSlab(*this); +} + +Slab::~Slab() { + unregSlab(*this); + bucket.getMemHandle().deallocate(MemPtr); +} + +// Return the index of the first available chunk, -1 otherwize +size_t Slab::FindFirstAvailableChunkIdx() const { + // Use the first free chunk index as a hint for the search. + auto It = std::find_if(Chunks.begin() + FirstFreeChunkIdx, Chunks.end(), + [](auto x) { return !x; }); + if (It != Chunks.end()) { + return It - Chunks.begin(); + } + + return static_cast(-1); +} + +void *Slab::getFreeChunk() { + assert(NumAllocated != Chunks.size()); + + const size_t ChunkIdx = FindFirstAvailableChunkIdx(); + // Free chunk must exist, otherwise we would have allocated another slab + assert(ChunkIdx != (static_cast(-1))); + + void *const FreeChunk = + (static_cast(getPtr())) + ChunkIdx * getChunkSize(); + Chunks[ChunkIdx] = true; + NumAllocated += 1; + + // Use the found index as the next hint + FirstFreeChunkIdx = ChunkIdx; + + return FreeChunk; +} + +Bucket &Slab::getBucket() { return bucket; } + +size_t Slab::getChunkSize() const { return bucket.getSize(); } + +void Slab::regSlabByAddr(void *Addr, Slab &Slab) { + auto &Lock = Slab.getBucket().getUsmAllocCtx().getKnownSlabsMapLock(); + auto &Map = Slab.getBucket().getUsmAllocCtx().getKnownSlabs(); + + std::lock_guard Lg(Lock); + Map.insert({Addr, Slab}); +} + +void Slab::unregSlabByAddr(void *Addr, Slab &Slab) { + auto &Lock = Slab.getBucket().getUsmAllocCtx().getKnownSlabsMapLock(); + auto &Map = Slab.getBucket().getUsmAllocCtx().getKnownSlabs(); + + std::lock_guard Lg(Lock); + + auto Slabs = Map.equal_range(Addr); + // At least the must get the current slab from the map. 
+ assert(Slabs.first != Slabs.second && "Slab is not found"); + + for (auto It = Slabs.first; It != Slabs.second; ++It) { + if (It->second == Slab) { + Map.erase(It); + return; + } + } + + assert(false && "Slab is not found"); +} + +void Slab::regSlab(Slab &Slab) { + void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabSize); + void *EndAddr = static_cast(StartAddr) + settings::SlabSize; + + regSlabByAddr(StartAddr, Slab); + regSlabByAddr(EndAddr, Slab); +} + +void Slab::unregSlab(Slab &Slab) { + void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabSize); + void *EndAddr = static_cast(StartAddr) + settings::SlabSize; + + unregSlabByAddr(StartAddr, Slab); + unregSlabByAddr(EndAddr, Slab); +} + +void Slab::freeChunk(void *Ptr) { + // This method should be called through bucket(since we might remove the slab + // as a result), therefore all locks are done on that level. + + // Make sure that we're in the right slab + assert(Ptr >= getPtr() && Ptr < getEnd()); + + // Even if the pointer p was previously aligned, it's still inside the + // corresponding chunk, so we get the correct index here. + auto ChunkIdx = + (static_cast(Ptr) - static_cast(MemPtr)) / getChunkSize(); + + // Make sure that the chunk was allocated + assert(Chunks[ChunkIdx] && "double free detected"); + + Chunks[ChunkIdx] = false; + NumAllocated -= 1; + + if (ChunkIdx < FirstFreeChunkIdx) + FirstFreeChunkIdx = ChunkIdx; +} + +bool Slab::hasAvail() { return NumAllocated != getNumChunks(); } + +auto Bucket::getAvailSlab() -> decltype(AvailableSlabs.begin()) { + if (AvailableSlabs.size() == 0) { + auto It = AvailableSlabs.insert(AvailableSlabs.begin(), + std::make_unique(*this)); + (*It)->setIterator(It); + } + + return AvailableSlabs.begin(); +} + +void *Bucket::getChunk() { + std::lock_guard Lg(BucketLock); + + auto SlabIt = getAvailSlab(); + auto *FreeChunk = (*SlabIt)->getFreeChunk(); + + // If the slab is full, move it to unavailable slabs and update its itreator + if (!((*SlabIt)->hasAvail())) { + auto It = + UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt)); + AvailableSlabs.erase(SlabIt); + (*It)->setIterator(It); + } + + return FreeChunk; +} + +void Bucket::freeChunk(void *Ptr, Slab &Slab) { + std::lock_guard Lg(BucketLock); + + Slab.freeChunk(Ptr); + + onFreeChunk(Slab); +} + +// The lock must be acquired before calling this method +void Bucket::onFreeChunk(Slab &Slab) { + // In case if the slab was previously full and now has 1 available + // chunk, it should be moved to the list of available slabs + if (Slab.getNumAllocated() == (Slab.getNumChunks() - 1)) { + auto SlabIter = Slab.getIterator(); + assert(SlabIter != UnavailableSlabs.end()); + + auto It = + AvailableSlabs.insert(AvailableSlabs.begin(), std::move(*SlabIter)); + UnavailableSlabs.erase(SlabIter); + + (*It)->setIterator(It); + } + + // Remove the slab when all the chunks from it are deallocated + // Note: since the slab is stored as unique_ptr, just remove it from + // the list to remove the list to destroy the object + if (Slab.getNumAllocated() == 0) { + auto It = Slab.getIterator(); + assert(It != AvailableSlabs.end()); + + AvailableSlabs.erase(It); + } +} + +SystemMemory &Bucket::getMemHandle() { return OwnAllocCtx.getMemHandle(); } + +void *USMAllocContext::USMAllocImpl::allocate(size_t Size) { + if (Size == 0) + return nullptr; + + if (Size > settings::CutOff) + return getMemHandle().allocate(Size); + + return findBucket(Size).getChunk(); +} + +void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t 
Alignment) { + if (Size == 0) + return nullptr; + + if (Alignment <= 1) + return allocate(Size); + + size_t AlignedSize = (Size > 1) ? AlignUp(Size, Alignment) : Alignment; + + // Check if our largest chunk is able to fit aligned size. + // If not, just request aligned pointer from the system. + if (AlignedSize > settings::CutOff) { + return getMemHandle().allocate(Size, Alignment); + } + + auto *Ptr = findBucket(AlignedSize).getChunk(); + return AlignPtrUp(Ptr, Alignment); +} + +Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) { + assert(Size <= settings::CutOff && "Unexpected size"); + + auto It = std::find_if( + Buckets.begin(), Buckets.end(), + [Size](const auto &BucketPtr) { return BucketPtr->getSize() >= Size; }); + + assert((It != Buckets.end()) && "Bucket should always exist"); + + return *(*It); +} + +void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) { + auto *SlabPtr = AlignPtrDown(Ptr, settings::SlabSize); + + // Lock the map on read + std::shared_lock Lk(getKnownSlabsMapLock()); + + auto Slabs = getKnownSlabs().equal_range(SlabPtr); + if (Slabs.first == Slabs.second) { + Lk.unlock(); + getMemHandle().deallocate(Ptr); + return; + } + + for (auto It = Slabs.first; It != Slabs.second; ++It) { + // The slab object won't be deleted until it's removed from the map which is + // protected by the lock, so it's safe to access it here. + auto &Slab = It->second; + if (Ptr >= Slab.getPtr() && Ptr < Slab.getEnd()) { + + // Unlock the map before freeing the chunk, it may be locked on write + // there + Lk.unlock(); + auto &Bucket = Slab.getBucket(); + Bucket.freeChunk(Ptr, Slab); + return; + } + } + + Lk.unlock(); + // There is a rare case when we have a pointer from system allocation next + // to some slab with an entry in the map. So we find a slab + // but the range checks fail. + getMemHandle().deallocate(Ptr); +} + +USMAllocContext::USMAllocContext(std::unique_ptr MemHandle) + : pImpl(std::make_unique(std::move(MemHandle))) {} + +void *USMAllocContext::allocate(size_t size) { return pImpl->allocate(size); } + +void *USMAllocContext::allocate(size_t size, size_t alignment) { + return pImpl->allocate(size, alignment); +} + +void USMAllocContext::deallocate(void *ptr) { return pImpl->deallocate(ptr); } + +// Define destructor for its usage with unique_ptr +USMAllocContext::~USMAllocContext() = default; diff --git a/sycl/plugins/level_zero/usm_allocator.hpp b/sycl/plugins/level_zero/usm_allocator.hpp new file mode 100644 index 0000000000000..b72ca77d41538 --- /dev/null +++ b/sycl/plugins/level_zero/usm_allocator.hpp @@ -0,0 +1,39 @@ +//===---------- usm_allocator.hpp - Allocator for USM memory --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef USM_ALLOCATOR +#define USM_ALLOCATOR + +#include + +// USM system memory allocation/deallocation interface. 
+class SystemMemory { +public: + virtual void *allocate(size_t size) = 0; + virtual void *allocate(size_t size, size_t aligned) = 0; + virtual void deallocate(void *ptr) = 0; + virtual ~SystemMemory() = default; +}; + +class USMAllocContext { +public: + // Keep it public since it needs to be accessed by the lower layer(Buckets) + class USMAllocImpl; + + USMAllocContext(std::unique_ptr memHandle); + ~USMAllocContext(); + + void *allocate(size_t size); + void *allocate(size_t size, size_t alignment); + void deallocate(void *ptr); + +private: + std::unique_ptr pImpl; +}; + +#endif diff --git a/sycl/test/usm/dmem_varied.cpp b/sycl/test/usm/dmem_varied.cpp new file mode 100644 index 0000000000000..5f8b34a51d6b3 --- /dev/null +++ b/sycl/test/usm/dmem_varied.cpp @@ -0,0 +1,115 @@ +// XFAIL: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---------- dmem_varied.cpp - Test various sizes and alignments ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace cl::sycl; + +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info() || + !dev.get_info()) { + return 0; + } + + // Check allocation on small sizes and a large one, For each allocation + // also check several different alignments. + // To verify on device, store a valu per each pointer and calculate + // the sum on device and then check it. 
+ + constexpr size_t smallSizeLimit = 256; + constexpr size_t alignmentLimit = 64; + constexpr size_t largeSize = 128 * 1024; // 128k + + // 1000 should be enough to store all the allocated pointers + constexpr size_t numPtrs = 1000; + + // Allocate as shared, this memory is used to store device pointers on host + // and pass them to device + uint8_t **ptrs = (uint8_t **)malloc_shared(numPtrs * sizeof(uint8_t *), q); + assert(ptrs != nullptr); + memset(ptrs, 0, numPtrs * sizeof(uint8_t *)); + + size_t count = 0; + + // Small sizes to allocate + // Allocated sizes 2^n - 1 up to smallSizeLimit + for (size_t size = 2; size <= smallSizeLimit; size *= 2) { + uint8_t *p = (uint8_t *)malloc_device(size - 1, q); + assert(p != nullptr); + ptrs[count++] = p; + + for (size_t alignment = 1; alignment <= alignmentLimit && alignment < size; + alignment *= 2) { + uint8_t *s = (uint8_t *)aligned_alloc_device(alignment, size - 1, q); + assert(s != nullptr); + assert(((size_t)s) % alignment == 0); + + ptrs[count++] = s; + } + } + + ptrs[count] = (uint8_t *)malloc_device(largeSize, q); + assert(ptrs[count]); + count++; + + for (size_t alignment = 1; alignment <= alignmentLimit; alignment *= 8) { + uint8_t *a = (uint8_t *)aligned_alloc_device(alignment, largeSize, q); + assert(a); + assert(((size_t)a) % alignment == 0); + + ptrs[count++] = a; + } + + q.submit([&](handler &h) { + h.single_task([=]() { + for (size_t i = 0; i < count; ++i) { + *ptrs[i] = 1; + } + }); + }); + + size_t *res = + (size_t *)aligned_alloc_shared(alignof(size_t), sizeof(size_t), q); + assert(res); + assert(((size_t)res) % alignof(size_t) == 0); + *res = 0; + + q.submit([&](handler &h) { + h.single_task([=]() { + for (size_t i = 0; i < count; ++i) { + *res += *ptrs[i]; + } + }); + }).wait(); + + assert(*res == count); + + for (size_t i = 0; i < numPtrs; ++i) { + if (ptrs[i] != nullptr) { + free(ptrs[i], q); + } + } + + free(res, q); + free(ptrs, q); + + return 0; +} diff --git a/sycl/test/usm/smem_concurrent.cpp b/sycl/test/usm/smem_concurrent.cpp new file mode 100644 index 0000000000000..ce0947eaa14f6 --- /dev/null +++ b/sycl/test/usm/smem_concurrent.cpp @@ -0,0 +1,123 @@ +// XFAIL: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==------ smem_concurrent.cpp - Concurrent USM allocation test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace cl::sycl; + +const int N = 8; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) { + return 1; + } + + // This test checks that we're able to allocate/deallocate shared memory while + // the kernel is running, but we don't actually access such memory on the + // host. + constexpr size_t allocSize = 100; + constexpr size_t numAllocs = 6; + + // Allocate "host" and "device" arrays of pointers. 
+ uint8_t **hostPtrs = + (uint8_t **)malloc_shared(numAllocs * sizeof(uint8_t *), q); + uint8_t **devicePtrs = + (uint8_t **)malloc_shared(numAllocs * sizeof(uint8_t *), q); + + // Fill "device" array with pointers to memory allocated with malloc_shared + for (size_t idx = 0; idx < numAllocs; ++idx) { + uint8_t *p = (uint8_t *)malloc_shared(allocSize, q); + *p = 1; + devicePtrs[idx] = p; + } + + // Fill first halft of "host" array with pointers to memory allocated with + // malloc_shared. This part will be freed later. + for (size_t idx = 0; idx < numAllocs / 2; ++idx) { + uint8_t *p = (uint8_t *)malloc_shared(allocSize, q); + *p = 1; + hostPtrs[idx] = p; + } + + // Allocate a memory to store the result of computation. + uint8_t *res = (uint8_t *)malloc_shared(1, q); + *res = 0; + + // Run computation on device using "device" array + auto e = q.submit([&](handler &h) { + h.single_task([res, devicePtrs]() { + for (size_t i = 0; i < numAllocs; ++i) { + *res += *(uint8_t *)devicePtrs[i]; + } + }); + }); + + // While running the computation kernel, + // free first half of "host" array + for (size_t i = 0; i < numAllocs / 2; ++i) { + free(hostPtrs[i], q); + hostPtrs[i] = nullptr; + } + + // And then fill the second part of array with pointers allocated via + // malloc_shared. + for (size_t i = numAllocs / 2; i < numAllocs; ++i) { + uint8_t *p = (uint8_t *)malloc_shared(allocSize, q); + *p = 1; + hostPtrs[i] = p; + } + + e.wait(); + + // After the kernel is finished update the computation result + // with data from "host" array of ptrs. + for (size_t i = 0; i < numAllocs; ++i) { + if (hostPtrs[i] == nullptr) { + *res += 2; + } else { + *res += *(uint8_t *)hostPtrs[i]; + } + } + + // Check the result + // +1 for each element in "device" array + // +2 for each freed "host" array ptr + // +1 for each allocated "host" array ptr + // + // total = 1 * numAllocs + numAllocs / 2 * 2 + numAllocs / 2 + assert(*res == (numAllocs * 2 + numAllocs / 2)); + + for (size_t i = 0; i < numAllocs; ++i) { + if (devicePtrs[i] != nullptr) { + free(devicePtrs[i], q); + } + if (hostPtrs[i] != nullptr) { + free(hostPtrs[i], q); + } + } + + free(res, q); + free(devicePtrs, q); + free(hostPtrs, q); + + return 0; +} diff --git a/sycl/test/usm/smem_varied.cpp b/sycl/test/usm/smem_varied.cpp new file mode 100644 index 0000000000000..02446a2ee2855 --- /dev/null +++ b/sycl/test/usm/smem_varied.cpp @@ -0,0 +1,108 @@ +// XFAIL: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out -DTEST_SHARED +// RUN: env SYCL_DEVICE_TYPE=HOST %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---------- smem_varied.cpp - Test various sizes and alignments ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace cl::sycl; + +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) { + return 0; + } + + // Check allocation on small sizes and a large one, For each allocation + // also check several different alignments. + // To verify on device, store a valu per each pointer and calculate + // the sum on device and then check it. 
+ + constexpr size_t smallSizeLimit = 256; + constexpr size_t alignmentLimit = 64; + constexpr size_t largeSize = 128 * 1024; // 128k + + // 1000 should be enough to store all the allocated pointers + constexpr size_t numPtrs = 1000; + + uint8_t **ptrs = (uint8_t **)malloc_shared(numPtrs * sizeof(uint8_t *), q); + assert(ptrs != nullptr); + memset(ptrs, 0, numPtrs * sizeof(uint8_t *)); + + size_t count = 0; + + // Small sizes to allocate + // Allocated sizes 2^n - 1 up to smallSizeLimit + for (size_t size = 2; size <= smallSizeLimit; size *= 2) { + uint8_t *p = (uint8_t *)malloc_shared(size - 1, q); + assert(p != nullptr); + *p = 1; + ptrs[count++] = p; + + for (size_t alignment = 1; alignment <= alignmentLimit && alignment < size; + alignment *= 2) { + uint8_t *s = (uint8_t *)aligned_alloc_shared(alignment, size - 1, q); + assert(s != nullptr); + assert(((size_t)s) % alignment == 0); + + *s = 1; + ptrs[count++] = s; + } + } + + ptrs[count] = (uint8_t *)malloc_shared(largeSize, q); + assert(ptrs[count]); + *ptrs[count] = 1; + count++; + + for (size_t alignment = 1; alignment <= alignmentLimit; alignment *= 8) { + uint8_t *a = (uint8_t *)aligned_alloc_shared(alignment, largeSize, q); + assert(a); + assert(((size_t)a) % alignment == 0); + *a = 1; + + ptrs[count++] = a; + } + + size_t *res = + (size_t *)aligned_alloc_shared(alignof(size_t), sizeof(size_t), q); + assert(res); + assert(((size_t)res) % alignof(size_t) == 0); + *res = 0; + + q.submit([&](handler &h) { + h.single_task([=]() { + for (size_t i = 0; i < count; ++i) { + *res += *ptrs[i]; + } + }); + }).wait(); + + assert(*res == count); + + for (size_t i = 0; i < numPtrs; ++i) { + if (ptrs[i] != nullptr) { + free(ptrs[i], q); + } + } + + free(res, q); + free(ptrs, q); + + return 0; +} From 3cfe30f7c5e75a67ff4126cf7506ed8849586196 Mon Sep 17 00:00:00 2001 From: Ilya Stepykin Date: Mon, 7 Sep 2020 17:13:22 +0300 Subject: [PATCH 2/5] [SYCL] Restrict USM allocators tests to Level zero There are issues with running these tests on OpenCL on windows. The primary goal of these tests is to test USM allocator which is currently emabled only for Level zero, so disable them on OpenCL for now. --- sycl/test/usm/dmem_varied.cpp | 5 ++++- sycl/test/usm/smem_concurrent.cpp | 4 ++++ sycl/test/usm/smem_varied.cpp | 5 ++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sycl/test/usm/dmem_varied.cpp b/sycl/test/usm/dmem_varied.cpp index 5f8b34a51d6b3..134379ef43ed2 100644 --- a/sycl/test/usm/dmem_varied.cpp +++ b/sycl/test/usm/dmem_varied.cpp @@ -4,6 +4,10 @@ // RUN: %CPU_RUN_PLACEHOLDER %t1.out // RUN: %GPU_RUN_PLACEHOLDER %t1.out +// This test is expected to reliably work with USM allocator which is +// currently enabled only on level zero. +// REQUIRES: level_zero + //==---------- dmem_varied.cpp - Test various sizes and alignments ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -14,7 +18,6 @@ #include -#include #include using namespace cl::sycl; diff --git a/sycl/test/usm/smem_concurrent.cpp b/sycl/test/usm/smem_concurrent.cpp index ce0947eaa14f6..dea944ebb2091 100644 --- a/sycl/test/usm/smem_concurrent.cpp +++ b/sycl/test/usm/smem_concurrent.cpp @@ -4,6 +4,10 @@ // RUN: %CPU_RUN_PLACEHOLDER %t1.out // RUN: %GPU_RUN_PLACEHOLDER %t1.out +// This test is expected to reliably work with USM allocator which is +// currently enabled only on level zero. 
+// REQUIRES: level_zero
+
 //==------ smem_concurrent.cpp - Concurrent USM allocation test -----------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test/usm/smem_varied.cpp b/sycl/test/usm/smem_varied.cpp
index 02446a2ee2855..a05ce44ab2a57 100644
--- a/sycl/test/usm/smem_varied.cpp
+++ b/sycl/test/usm/smem_varied.cpp
@@ -4,6 +4,10 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out

+// This test is expected to reliably work with USM allocator which is
+// currently enabled only on level zero.
+// REQUIRES: level_zero
+
 //==---------- smem_varied.cpp - Test various sizes and alignments ---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -14,7 +18,6 @@

 #include
-#include
 #include

 using namespace cl::sycl;

From d03500e9ddc4f0e776d74e029f4fb033531884df Mon Sep 17 00:00:00 2001
From: Ilya Stepykin
Date: Tue, 8 Sep 2020 02:10:49 +0300
Subject: [PATCH 3/5] [SYCL] Fix pointer alignment

Due to incorrect pointer alignment in the case when allocation alignment >
size, we returned to the user an incorrect chunk (i.e. the one next to the
chunk that was marked as non-free), which is later detected as a
"double free" error.

Also corrected the existing testcases to cover this scenario.
---
 sycl/plugins/level_zero/usm_allocator.cpp | 12 ++++++++++--
 sycl/test/usm/dmem_varied.cpp | 4 ++--
 sycl/test/usm/smem_varied.cpp | 4 ++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index f10e45c8558d5..7aed86472fd9b 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -76,7 +76,12 @@ static void *AlignPtrDown(void *Ptr, const size_t Alignment) {
 // Aligns the pointer up to the specified alignment
 // (e.g. 
returns 16 for Size = 13, Alignment = 8) static void *AlignPtrUp(void *Ptr, const size_t Alignment) { - return static_cast(AlignPtrDown(Ptr, Alignment)) + Alignment; + void *AlignedPtr = AlignPtrDown(Ptr, Alignment); + // Special case when the pointer is already aligned + if (Ptr == AlignedPtr) { + return Ptr; + } + return static_cast(AlignedPtr) + Alignment; } // Aligns the value up to the specified alignment @@ -150,6 +155,7 @@ class Slab { bool hasAvail(); Bucket &getBucket(); + const Bucket &getBucket() const; void freeChunk(void *Ptr); }; @@ -230,7 +236,8 @@ bool operator==(const Slab &Lhs, const Slab &Rhs) { } std::ostream &operator<<(std::ostream &Os, const Slab &Slab) { - Os << "Slab<" << Slab.getPtr() << ", " << Slab.getEnd() << ">"; + Os << "Slab<" << Slab.getPtr() << ", " << Slab.getEnd() << ", " + << Slab.getBucket().getSize() << ">"; return Os; } @@ -280,6 +287,7 @@ void *Slab::getFreeChunk() { } Bucket &Slab::getBucket() { return bucket; } +const Bucket &Slab::getBucket() const { return bucket; } size_t Slab::getChunkSize() const { return bucket.getSize(); } diff --git a/sycl/test/usm/dmem_varied.cpp b/sycl/test/usm/dmem_varied.cpp index 134379ef43ed2..a1af3d157182c 100644 --- a/sycl/test/usm/dmem_varied.cpp +++ b/sycl/test/usm/dmem_varied.cpp @@ -59,8 +59,8 @@ int main() { assert(p != nullptr); ptrs[count++] = p; - for (size_t alignment = 1; alignment <= alignmentLimit && alignment < size; - alignment *= 2) { + // Also test cases with alignment > size + for (size_t alignment = 1; alignment <= alignmentLimit; alignment *= 2) { uint8_t *s = (uint8_t *)aligned_alloc_device(alignment, size - 1, q); assert(s != nullptr); assert(((size_t)s) % alignment == 0); diff --git a/sycl/test/usm/smem_varied.cpp b/sycl/test/usm/smem_varied.cpp index a05ce44ab2a57..9a251e6c5f187 100644 --- a/sycl/test/usm/smem_varied.cpp +++ b/sycl/test/usm/smem_varied.cpp @@ -57,8 +57,8 @@ int main() { *p = 1; ptrs[count++] = p; - for (size_t alignment = 1; alignment <= alignmentLimit && alignment < size; - alignment *= 2) { + // Also test cases with alignment > size + for (size_t alignment = 1; alignment <= alignmentLimit; alignment *= 2) { uint8_t *s = (uint8_t *)aligned_alloc_shared(alignment, size - 1, q); assert(s != nullptr); assert(((size_t)s) % alignment == 0); From 0fbffa44ba7c814d87f2fa8d3275bf914322c4a2 Mon Sep 17 00:00:00 2001 From: Ilya Stepykin Date: Tue, 8 Sep 2020 22:00:03 +0300 Subject: [PATCH 4/5] [SYCL] Minor fix and wording change --- sycl/doc/EnvironmentVariables.md | 2 +- sycl/plugins/level_zero/usm_allocator.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index eb8c2f13fb4e9..73fcdc9e767ed 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -26,7 +26,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. | | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images | | SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. 
| -| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in L0 plugin(each memory request will go directly to L0 runtine) | +| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin(each memory request will go directly to Level Zero runtine) | `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp index 7aed86472fd9b..1596bf71bb427 100644 --- a/sycl/plugins/level_zero/usm_allocator.cpp +++ b/sycl/plugins/level_zero/usm_allocator.cpp @@ -110,7 +110,7 @@ class Slab { std::vector Chunks; // Total number of allocated chunks at the moment. - size_t NumAllocated; + size_t NumAllocated = 0; // The bucket which the slab belongs to Bucket &bucket; @@ -122,7 +122,7 @@ class Slab { ListIter SlabListIter; // Hints where to start search for free chunk in a slab - size_t FirstFreeChunkIdx; + size_t FirstFreeChunkIdx = 0; // Return the index of the first available chunk, -1 otherwize size_t FindFirstAvailableChunkIdx() const; @@ -245,7 +245,7 @@ Slab::Slab(Bucket &Bkt) : MemPtr(Bkt.getMemHandle().allocate(settings::SlabSize)), // In case if bucket size is not that SlabSize % b.getSize() == 0, we // would have some padding at the end of the slab. - Chunks(settings::SlabSize / Bkt.getSize()), NumAllocated{}, + Chunks(settings::SlabSize / Bkt.getSize()), NumAllocated{0}, bucket(Bkt), SlabListIter{}, FirstFreeChunkIdx{0} { regSlab(*this); From d9dc92b7d7917035a9bb895c35cf7d57b12283be Mon Sep 17 00:00:00 2001 From: Ilya Stepykin Date: Wed, 9 Sep 2020 01:09:09 +0300 Subject: [PATCH 5/5] Fix a typo Co-authored-by: Pavel Chupin <45979248+pvchupin@users.noreply.github.com> --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 73fcdc9e767ed..0a1dcd0a03198 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -26,7 +26,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. | | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images | | SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | -| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin(each memory request will go directly to Level Zero runtine) | +| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | `(*) Note: Any means this environment variable is effective when set to any non-null value.`
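For reference, the workload this series targets (many small USM requests that would otherwise each consume a Level Zero memory page) can be reproduced with a small standalone program such as the sketch below. This is an illustrative example only: the allocation count, size, and timing code are assumptions added for this write-up, not part of the patches. Running it with and without SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR set compares the pooled allocator path against direct Level Zero allocations.

// usm_small_allocs.cpp - illustrative sketch only; not part of the patches.
// The allocation count and size below are arbitrary assumptions.
// Build (assumed): clang++ -fsycl usm_small_allocs.cpp -o usm_small_allocs
// Compare: ./usm_small_allocs
//     vs.  SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR=1 ./usm_small_allocs
#include <CL/sycl.hpp>
#include <chrono>
#include <cstdio>
#include <vector>

using namespace cl::sycl;

int main() {
  queue q;
  if (!q.get_device().get_info<info::device::usm_shared_allocations>())
    return 0;

  constexpr size_t NumAllocs = 10000;
  constexpr size_t AllocSize = 64; // far below the 64k slab size

  std::vector<void *> Ptrs;
  Ptrs.reserve(NumAllocs);

  auto Start = std::chrono::steady_clock::now();
  // With the allocator enabled, these requests are served from slab chunks;
  // with SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR set, each one goes straight
  // to the Level Zero runtime.
  for (size_t I = 0; I < NumAllocs; ++I)
    Ptrs.push_back(malloc_shared(AllocSize, q));
  for (void *P : Ptrs)
    free(P, q);
  auto End = std::chrono::steady_clock::now();

  auto Us =
      std::chrono::duration_cast<std::chrono::microseconds>(End - Start)
          .count();
  std::printf("%zu alloc/free pairs took %lld us\n", NumAllocs,
              (long long)Us);
  return 0;
}

If both runs complete but the run with the allocator disabled is noticeably slower for this pattern, that difference is the per-allocation page overhead described in the first commit message.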