diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index 49e569077f9fd..1f981bdd66dc6 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -1298,7 +1298,8 @@ using pi_usm_migration_flags = _pi_usm_migration_flags;
 /// \param context is the pi_context
 /// \param pi_usm_mem_properties are optional allocation properties
 /// \param size_t is the size of the allocation
-/// \param alignment is the desired alignment of the allocation
+/// \param alignment is the desired alignment of the allocation. 0 indicates no
+///        requirements, and uses the backend default alignment.
 pi_result piextUSMHostAlloc(void **result_ptr, pi_context context,
                             pi_usm_mem_properties *properties, size_t size,
                             pi_uint32 alignment);
@@ -1310,7 +1311,8 @@ pi_result piextUSMHostAlloc(void **result_ptr, pi_context context,
 /// \param device is the device the memory will be allocated on
 /// \param pi_usm_mem_properties are optional allocation properties
 /// \param size_t is the size of the allocation
-/// \param alignment is the desired alignment of the allocation
+/// \param alignment is the desired alignment of the allocation. 0 indicates no
+///        requirements, and uses the backend default alignment.
 pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context,
                               pi_device device,
                               pi_usm_mem_properties *properties, size_t size,
@@ -1323,7 +1325,8 @@ pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context,
 /// \param device is the device the memory will be allocated on
 /// \param pi_usm_mem_properties are optional allocation properties
 /// \param size_t is the size of the allocation
-/// \param alignment is the desired alignment of the allocation
+/// \param alignment is the desired alignment of the allocation. 0 indicates no
+///        requirements, and uses the backend default alignment.
 pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context,
                               pi_device device,
                               pi_usm_mem_properties *properties, size_t size,
@@ -1340,8 +1343,7 @@ pi_result piextUSMFree(pi_context context, void *ptr);
 /// \param queue is the queue to submit to
 /// \param ptr is the ptr to memset
 /// \param value is value to set.  It is interpreted as an 8-bit value and the
-/// upper
-///        24 bits are ignored
+///        upper 24 bits are ignored
 /// \param count is the size in bytes to memset
 /// \param num_events_in_waitlist is the number of events to wait on
 /// \param events_waitlist is an array of events to wait on
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index af15743438da8..dad8abdb28fb1 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -3396,9 +3396,19 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
 pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
                                  pi_usm_mem_properties *properties, size_t size,
                                  pi_uint32 alignment) {
+  // from empirical testing with CUDA 10.2 on a Tesla K40
+  static constexpr pi_uint32 max_alignment = 0x200;
+
+  // enforce a valid pointer to the allocated memory
   assert(result_ptr != nullptr);
+  // check that the context is valid
   assert(context != nullptr);
+  // check that the property list is empty
   assert(properties == nullptr);
+  // check that the alignment is not larger than max_alignment, and is either 0
+  // or a power of 2
+  assert(alignment <= max_alignment && (alignment & (alignment - 1)) == 0);
+
   pi_result result = PI_SUCCESS;
   try {
     ScopedContext active(context);
@@ -3406,7 +3416,9 @@ pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
   } catch (pi_result error) {
     result = error;
   }
-  assert(reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0);
+  // check that the result is suitably aligned
+  assert((alignment == 0) ||
+         (reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
   return result;
 }
 
@@ -3416,10 +3428,21 @@ pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
                                    pi_device device,
                                    pi_usm_mem_properties *properties,
                                    size_t size, pi_uint32 alignment) {
+  // from empirical testing with CUDA 10.2 on a Tesla K40
+  static constexpr pi_uint32 max_alignment = 0x200;
+
+  // enforce a valid pointer to the allocated memory
   assert(result_ptr != nullptr);
+  // check that the context is valid
   assert(context != nullptr);
+  // check that the device is valid
   assert(device != nullptr);
+  // check that the property list is empty
   assert(properties == nullptr);
+  // check that the alignment is not larger than max_alignment, and is either 0
+  // or a power of 2
+  assert(alignment <= max_alignment && (alignment & (alignment - 1)) == 0);
+
   pi_result result = PI_SUCCESS;
   try {
     ScopedContext active(context);
@@ -3427,7 +3450,9 @@ pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
   } catch (pi_result error) {
     result = error;
   }
-  assert(reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0);
+  // check that the result is suitably aligned
+  assert((alignment == 0) ||
+         (reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
   return result;
 }
 
@@ -3437,10 +3462,21 @@ pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
                                    pi_device device,
                                    pi_usm_mem_properties *properties,
                                    size_t size, pi_uint32 alignment) {
+  // from empirical testing with CUDA 10.2 on a Tesla K40
+  static constexpr pi_uint32 max_alignment = 0x200;
+
+  // enforce a valid pointer to the allocated memory
   assert(result_ptr != nullptr);
+  // check that the context is valid
   assert(context != nullptr);
+  // check that the device is valid
   assert(device != nullptr);
+  // check that the property list is empty
   assert(properties == nullptr);
+  // check that the alignment is not larger than max_alignment, and is either 0
+  // or a power of 2
+  assert(alignment <= max_alignment && (alignment & (alignment - 1)) == 0);
+
   pi_result result = PI_SUCCESS;
   try {
     ScopedContext active(context);
@@ -3449,7 +3485,9 @@ pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
   } catch (pi_result error) {
     result = error;
   }
-  assert(reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0);
+  // check that the result is suitably aligned
+  assert((alignment == 0) ||
+         (reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
   return result;
 }
 
@@ -3481,8 +3519,12 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
                                      pi_uint32 num_events_in_waitlist,
                                      const pi_event *events_waitlist,
                                      pi_event *event) {
+  // enforce that the queue is valid
   assert(queue != nullptr);
-  assert(ptr != nullptr);
+  // check that the pointer is valid
+  if (ptr == nullptr) {
+    return PI_INVALID_VALUE;
+  }
   CUstream cuStream = queue->get();
   pi_result result = PI_SUCCESS;
   std::unique_ptr<_pi_event> event_ptr{nullptr};
@@ -3514,9 +3556,12 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking,
                                      pi_uint32 num_events_in_waitlist,
                                      const pi_event *events_waitlist,
                                      pi_event *event) {
+  // enforce that the queue is valid
   assert(queue != nullptr);
-  assert(dst_ptr != nullptr);
-  assert(src_ptr != nullptr);
+  // check that the source and destination pointers are valid
+  if (dst_ptr == nullptr || src_ptr == nullptr) {
+    return PI_INVALID_VALUE;
+  }
   CUstream cuStream = queue->get();
   pi_result result = PI_SUCCESS;
   std::unique_ptr<_pi_event> event_ptr{nullptr};
@@ -3553,8 +3598,12 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
                                        pi_uint32 num_events_in_waitlist,
                                        const pi_event *events_waitlist,
                                        pi_event *event) {
+  // enforce that the queue is valid
   assert(queue != nullptr);
-  assert(ptr != nullptr);
+  // check that the pointer is valid
+  if (ptr == nullptr) {
+    return PI_INVALID_VALUE;
+  }
   CUstream cuStream = queue->get();
   pi_result result = PI_SUCCESS;
   std::unique_ptr<_pi_event> event_ptr{nullptr};
@@ -3589,8 +3638,12 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
 pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
                                         size_t length, int advice,
                                         pi_event *event) {
+  // enforce that the queue is valid
   assert(queue != nullptr);
-  assert(ptr != nullptr);
+  // check that the pointer is valid
+  if (ptr == nullptr) {
+    return PI_INVALID_VALUE;
+  }
   // TODO implement a mapping to cuMemAdvise once the expected behaviour
   // of piextUSMEnqueueMemAdvise is detailed in the USM extension
   return cuda_piEnqueueEventsWait(queue, 0, nullptr, event);
diff --git a/sycl/test/usm/allocator_vector.cpp b/sycl/test/usm/allocator_vector.cpp
index 2a87695c2f2ff..a2cec79fb793c 100644
--- a/sycl/test/usm/allocator_vector.cpp
+++ b/sycl/test/usm/allocator_vector.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/allocator_vector_fail.cpp b/sycl/test/usm/allocator_vector_fail.cpp
index 8c0e176eaa411..6a0276d72bf59 100644
--- a/sycl/test/usm/allocator_vector_fail.cpp
+++ b/sycl/test/usm/allocator_vector_fail.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/allocatorll.cpp b/sycl/test/usm/allocatorll.cpp
index 323dc0d75a1c1..7e633757e8369 100644
--- a/sycl/test/usm/allocatorll.cpp
+++ b/sycl/test/usm/allocatorll.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/badmalloc.cpp b/sycl/test/usm/badmalloc.cpp
index b139d7dbf80d8..ec99a6e475181 100644
--- a/sycl/test/usm/badmalloc.cpp
+++ b/sycl/test/usm/badmalloc.cpp
@@ -1,5 +1,4 @@
 // UNSUPPORTED: windows
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/depends_on.cpp b/sycl/test/usm/depends_on.cpp
index c985fc6c7a9a9..8716db434fed8 100644
--- a/sycl/test/usm/depends_on.cpp
+++ b/sycl/test/usm/depends_on.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/dmemll.cpp b/sycl/test/usm/dmemll.cpp
index 8617c9f751958..cf367385abd06 100644
--- a/sycl/test/usm/dmemll.cpp
+++ b/sycl/test/usm/dmemll.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/hmemll.cpp b/sycl/test/usm/hmemll.cpp
index 4ec1fd6a6516a..ac741b71ad372 100644
--- a/sycl/test/usm/hmemll.cpp
+++ b/sycl/test/usm/hmemll.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/math.cpp b/sycl/test/usm/math.cpp
index 22a8dc2b2e2bd..f0d0ba78f62e7 100644
--- a/sycl/test/usm/math.cpp
+++ b/sycl/test/usm/math.cpp
@@ -3,7 +3,6 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 
 // REQUIRES: cpu
-// XFAIL: cuda
 // TODO: ptxas fatal   : Unresolved extern function '_Z20__spirv_ocl_lgamma_rfPi'
 
 #include <CL/sycl.hpp>
diff --git a/sycl/test/usm/memadvise.cpp b/sycl/test/usm/memadvise.cpp
index 8183f4a59c784..a50d9e52e0a81 100644
--- a/sycl/test/usm/memadvise.cpp
+++ b/sycl/test/usm/memadvise.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // SYCL runtime and piextUSM*Alloc functions for CUDA not behaving as described
 // in: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 //
diff --git a/sycl/test/usm/memcpy.cpp b/sycl/test/usm/memcpy.cpp
index 0b933d0f004aa..7643f0f0f3ba3 100644
--- a/sycl/test/usm/memcpy.cpp
+++ b/sycl/test/usm/memcpy.cpp
@@ -5,7 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/memset.cpp b/sycl/test/usm/memset.cpp
index 313fa4cbda591..80a10b6b84c1f 100644
--- a/sycl/test/usm/memset.cpp
+++ b/sycl/test/usm/memset.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/mixed.cpp b/sycl/test/usm/mixed.cpp
index 092a1e51d4b4f..ed1e7b6d46013 100644
--- a/sycl/test/usm/mixed.cpp
+++ b/sycl/test/usm/mixed.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/mixed2.cpp b/sycl/test/usm/mixed2.cpp
index 7e8ef785c42cb..278025a98c78a 100644
--- a/sycl/test/usm/mixed2.cpp
+++ b/sycl/test/usm/mixed2.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/mixed2template.cpp b/sycl/test/usm/mixed2template.cpp
index 24acd20396e98..24817129d8957 100644
--- a/sycl/test/usm/mixed2template.cpp
+++ b/sycl/test/usm/mixed2template.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/mixed_queue.cpp b/sycl/test/usm/mixed_queue.cpp
index 1c99ebda7b5ce..0a1fe439ff2da 100644
--- a/sycl/test/usm/mixed_queue.cpp
+++ b/sycl/test/usm/mixed_queue.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/queue_wait.cpp b/sycl/test/usm/queue_wait.cpp
index dfb2d9414fc21..6afc584b844ae 100644
--- a/sycl/test/usm/queue_wait.cpp
+++ b/sycl/test/usm/queue_wait.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
diff --git a/sycl/test/usm/smemll.cpp b/sycl/test/usm/smemll.cpp
index 46d1f10a5f33f..f919f136fd33d 100644
--- a/sycl/test/usm/smemll.cpp
+++ b/sycl/test/usm/smemll.cpp
@@ -1,4 +1,3 @@
-// XFAIL: cuda
 // piextUSM*Alloc functions for CUDA are not behaving as described in
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
 // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc