diff --git a/buildbot/testlist.cfg b/buildbot/testlist.cfg index e60fea347dc99..01aa2bd10c151 100644 --- a/buildbot/testlist.cfg +++ b/buildbot/testlist.cfg @@ -11,7 +11,7 @@ :test_exception_handling :test_group :test_h_item -:test_handler +#:test_handler :test_header :test_hierarchical :test_id @@ -20,7 +20,7 @@ :test_item :test_kernel :test_kernel_args -:test_math_builtin_api +#:test_math_builtin_api :test_multi_ptr :test_nd_item :test_nd_range @@ -38,8 +38,7 @@ :test_vector_api :test_vector_constructors :test_vector_load_store -# Disable test to speedup testing until JIT is optimized #:test_vector_operators :test_vector_swizzle_assignment -:test_vector_swizzles -:test_vector_swizzles_opencl +#:test_vector_swizzles +#:test_vector_swizzles_opencl diff --git a/sycl/include/CL/sycl/backend/cuda.hpp b/sycl/include/CL/sycl/backend/cuda.hpp index a0dfae334497f..aff3e96d54e4a 100644 --- a/sycl/include/CL/sycl/backend/cuda.hpp +++ b/sycl/include/CL/sycl/backend/cuda.hpp @@ -18,7 +18,9 @@ namespace cuda { // Mem Object info: Retrieve the raw CUDA pointer from a cl_mem #define PI_CUDA_RAW_POINTER (0xFF01) -// Context creation: Use the primary context instead of a custom one +// Context creation: Use a primary CUDA context instead of a custom one by +// providing a property value of PI_TRUE for the following +// property ID. #define PI_CONTEXT_PROPERTIES_CUDA_PRIMARY (0xFF02) // PI Command Queue using Default stream diff --git a/sycl/include/CL/sycl/context.hpp b/sycl/include/CL/sycl/context.hpp index 926bb22aebfdd..f640bd2f53e23 100644 --- a/sycl/include/CL/sycl/context.hpp +++ b/sycl/include/CL/sycl/context.hpp @@ -49,8 +49,8 @@ class context { /// @param AsyncHandler is an instance of async_handler. /// @param UseCUDAPrimaryContext is a bool determining whether to use the /// primary context in the CUDA backend. - context(const device &Device, async_handler AsyncHandler = {}, - bool UseCUDAPrimaryContext = false); + explicit context(const device &Device, async_handler AsyncHandler = {}, + bool UseCUDAPrimaryContext = false); /// Constructs a SYCL context instance using the provided platform. /// @@ -63,8 +63,8 @@ class context { /// @param AsyncHandler is an instance of async_handler. /// @param UseCUDAPrimaryContext is a bool determining whether to use the /// primary context in the CUDA backend. - context(const platform &Platform, async_handler AsyncHandler = {}, - bool UseCUDAPrimaryContext = false); + explicit context(const platform &Platform, async_handler AsyncHandler = {}, + bool UseCUDAPrimaryContext = false); /// Constructs a SYCL context instance using list of devices. /// @@ -78,8 +78,9 @@ class context { /// @param AsyncHandler is an instance of async_handler. /// @param UseCUDAPrimaryContext is a bool determining whether to use the /// primary context in the CUDA backend. - context(const vector_class &DeviceList, - async_handler AsyncHandler = {}, bool UseCUDAPrimaryContext = false); + explicit context(const vector_class &DeviceList, + async_handler AsyncHandler = {}, + bool UseCUDAPrimaryContext = false); /// Constructs a SYCL context instance from OpenCL cl_context. /// diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 686fdc49f753a..8e408ac45c065 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -31,19 +31,19 @@ // elsewhere, e.g. in the pi_opencl, but constants/enums mapping is now // done here, for efficiency and simplicity. // -#include #include +#include #include #ifdef __cplusplus extern "C" { #endif // __cplusplus -typedef int32_t pi_int32; -typedef uint32_t pi_uint32; -typedef uint64_t pi_uint64; -typedef pi_uint32 pi_bool; -typedef pi_uint64 pi_bitfield; +typedef int32_t pi_int32; +typedef uint32_t pi_uint32; +typedef uint64_t pi_uint64; +typedef pi_uint32 pi_bool; +typedef pi_uint64 pi_bitfield; // // NOTE: prefer to map 1:1 to OpenCL so that no translation is needed @@ -73,7 +73,7 @@ typedef enum { PI_MISALIGNED_SUB_BUFFER_OFFSET = CL_MISALIGNED_SUB_BUFFER_OFFSET, PI_BUILD_PROGRAM_FAILURE = CL_BUILD_PROGRAM_FAILURE, PI_INVALID_WORK_GROUP_SIZE = CL_INVALID_WORK_GROUP_SIZE, - PI_ERROR_UNKNOWN = -999 + PI_ERROR_UNKNOWN = -999 } _pi_result; typedef enum { @@ -83,16 +83,6 @@ typedef enum { PI_EVENT_QUEUED = CL_QUEUED } _pi_event_status; -typedef enum { - PI_COMMAND_KERNEL_LAUNCH = CL_COMMAND_NDRANGE_KERNEL, - PI_COMMAND_MEMBUFFER_WRITE = CL_COMMAND_WRITE_BUFFER, - PI_COMMAND_MEMBUFFER_READ = CL_COMMAND_READ_BUFFER, - PI_COMMAND_USER = CL_COMMAND_USER, - PI_COMMAND_EVENTS_WAIT = CL_COMMAND_MARKER, - PI_COMMAND_MEMBUFFER_COPY = CL_COMMAND_COPY_BUFFER, - PI_COMMAND_MEMBUFFER_FILL = CL_COMMAND_FILL_BUFFER -} _pi_command_type; - typedef enum { PI_PLATFORM_INFO_EXTENSIONS = CL_PLATFORM_EXTENSIONS, PI_PLATFORM_INFO_NAME = CL_PLATFORM_NAME, @@ -101,17 +91,6 @@ typedef enum { PI_PLATFORM_INFO_VERSION = CL_PLATFORM_VERSION, } _pi_platform_info; -typedef enum { - PI_PROGRAM_INFO_REFERENCE_COUNT = CL_PROGRAM_REFERENCE_COUNT, - PI_PROGRAM_INFO_CONTEXT = CL_PROGRAM_CONTEXT, - PI_PROGRAM_INFO_NUM_DEVICES = CL_PROGRAM_NUM_DEVICES, - PI_PROGRAM_INFO_DEVICES = CL_PROGRAM_DEVICES, - PI_PROGRAM_INFO_SOURCE = CL_PROGRAM_SOURCE, - PI_PROGRAM_INFO_BINARY_SIZES = CL_PROGRAM_BINARY_SIZES, - PI_PROGRAM_INFO_BINARIES = CL_PROGRAM_BINARIES, - PI_PROGRAM_INFO_KERNEL_NAMES = CL_PROGRAM_KERNEL_NAMES -} _pi_program_info; - typedef enum { PI_PROGRAM_BUILD_INFO_STATUS = CL_PROGRAM_BUILD_STATUS, PI_PROGRAM_BUILD_INFO_OPTIONS = CL_PROGRAM_BUILD_OPTIONS, @@ -134,6 +113,17 @@ typedef enum : pi_uint64 { PI_DEVICE_TYPE_ACC = CL_DEVICE_TYPE_ACCELERATOR } _pi_device_type; +typedef enum { + PI_DEVICE_MEM_CACHE_TYPE_NONE = CL_NONE, + PI_DEVICE_MEM_CACHE_TYPE_READ_ONLY_CACHE = CL_READ_ONLY_CACHE, + PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE = CL_READ_WRITE_CACHE +} _pi_device_mem_cache_type; + +typedef enum { + PI_DEVICE_LOCAL_MEM_TYPE_LOCAL = CL_LOCAL, + PI_DEVICE_LOCAL_MEM_TYPE_GLOBAL = CL_GLOBAL +} _pi_device_local_mem_type; + typedef enum { PI_DEVICE_INFO_TYPE = CL_DEVICE_TYPE, PI_DEVICE_INFO_VENDOR_ID = CL_DEVICE_VENDOR_ID, @@ -141,57 +131,82 @@ typedef enum { PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES = CL_DEVICE_MAX_WORK_ITEM_SIZES, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE = CL_DEVICE_MAX_WORK_GROUP_SIZE, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, - PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, - PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, + PI_DEVICE_INFO_SINGLE_FP_CONFIG = CL_DEVICE_SINGLE_FP_CONFIG, + PI_DEVICE_INFO_HALF_FP_CONFIG = CL_DEVICE_HALF_FP_CONFIG, + PI_DEVICE_INFO_DOUBLE_FP_CONFIG = CL_DEVICE_DOUBLE_FP_CONFIG, + PI_DEVICE_INFO_QUEUE_PROPERTIES = CL_DEVICE_QUEUE_PROPERTIES, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, + PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = + CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = + CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = + CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = + CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = + CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = + CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = + CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, + PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = + CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY = CL_DEVICE_MAX_CLOCK_FREQUENCY, PI_DEVICE_INFO_ADDRESS_BITS = CL_DEVICE_ADDRESS_BITS, PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = CL_DEVICE_MAX_MEM_ALLOC_SIZE, PI_DEVICE_INFO_IMAGE_SUPPORT = CL_DEVICE_IMAGE_SUPPORT, PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS = CL_DEVICE_MAX_READ_IMAGE_ARGS, PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = CL_DEVICE_MAX_WRITE_IMAGE_ARGS, - PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = CL_DEVICE_IMAGE2D_MAX_HEIGHT, PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH = CL_DEVICE_IMAGE2D_MAX_WIDTH, - PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = CL_DEVICE_IMAGE3D_MAX_HEIGHT, + PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = CL_DEVICE_IMAGE2D_MAX_HEIGHT, PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH = CL_DEVICE_IMAGE3D_MAX_WIDTH, + PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = CL_DEVICE_IMAGE3D_MAX_HEIGHT, PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH = CL_DEVICE_IMAGE3D_MAX_DEPTH, - PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, + PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = + CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, PI_DEVICE_INFO_MAX_SAMPLERS = CL_DEVICE_MAX_SAMPLERS, PI_DEVICE_INFO_MAX_PARAMETER_SIZE = CL_DEVICE_MAX_PARAMETER_SIZE, PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = CL_DEVICE_MEM_BASE_ADDR_ALIGN, - PI_DEVICE_INFO_HALF_FP_CONFIG = CL_DEVICE_HALF_FP_CONFIG, - PI_DEVICE_INFO_SINGLE_FP_CONFIG = CL_DEVICE_SINGLE_FP_CONFIG, - PI_DEVICE_INFO_DOUBLE_FP_CONFIG = CL_DEVICE_DOUBLE_FP_CONFIG, - PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, - PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, - PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, + PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = + CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, + PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = + CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, + PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = + CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, PI_DEVICE_INFO_GLOBAL_MEM_SIZE = CL_DEVICE_GLOBAL_MEM_SIZE, - PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = + CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, PI_DEVICE_INFO_MAX_CONSTANT_ARGS = CL_DEVICE_MAX_CONSTANT_ARGS, PI_DEVICE_INFO_LOCAL_MEM_TYPE = CL_DEVICE_LOCAL_MEM_TYPE, PI_DEVICE_INFO_LOCAL_MEM_SIZE = CL_DEVICE_LOCAL_MEM_SIZE, - PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = CL_DEVICE_ERROR_CORRECTION_SUPPORT, + PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = + CL_DEVICE_ERROR_CORRECTION_SUPPORT, PI_DEVICE_INFO_HOST_UNIFIED_MEMORY = CL_DEVICE_HOST_UNIFIED_MEMORY, - PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = CL_DEVICE_PROFILING_TIMER_RESOLUTION, - PI_DEVICE_INFO_IS_ENDIAN_LITTLE = CL_DEVICE_ENDIAN_LITTLE, - PI_DEVICE_INFO_IS_AVAILABLE = CL_DEVICE_AVAILABLE, - PI_DEVICE_INFO_IS_COMPILER_AVAILABLE = CL_DEVICE_COMPILER_AVAILABLE, - PI_DEVICE_INFO_IS_LINKER_AVAILABLE = CL_DEVICE_LINKER_AVAILABLE, - PI_DEVICE_INFO_EXECUTION_CAPABILITIES = CL_DEVICE_EXECUTION_CAPABILITIES, - PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, - PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, + PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = + CL_DEVICE_PROFILING_TIMER_RESOLUTION, + PI_DEVICE_INFO_ENDIAN_LITTLE = CL_DEVICE_ENDIAN_LITTLE, + PI_DEVICE_INFO_AVAILABLE = CL_DEVICE_AVAILABLE, + PI_DEVICE_INFO_COMPILER_AVAILABLE = CL_DEVICE_COMPILER_AVAILABLE, + PI_DEVICE_INFO_LINKER_AVAILABLE = CL_DEVICE_LINKER_AVAILABLE, + PI_DEVICE_INFO_EXECUTION_CAPABILITIES = + CL_DEVICE_EXECUTION_CAPABILITIES, + PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = + CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, + PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = + CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, PI_DEVICE_INFO_BUILT_IN_KERNELS = CL_DEVICE_BUILT_IN_KERNELS, PI_DEVICE_INFO_PLATFORM = CL_DEVICE_PLATFORM, PI_DEVICE_INFO_REFERENCE_COUNT = CL_DEVICE_REFERENCE_COUNT, @@ -203,48 +218,57 @@ typedef enum { PI_DEVICE_INFO_OPENCL_C_VERSION = CL_DEVICE_OPENCL_C_VERSION, PI_DEVICE_INFO_EXTENSIONS = CL_DEVICE_EXTENSIONS, PI_DEVICE_INFO_PRINTF_BUFFER_SIZE = CL_DEVICE_PRINTF_BUFFER_SIZE, - PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, + PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = + CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, PI_DEVICE_INFO_PARENT_DEVICE = CL_DEVICE_PARENT_DEVICE, - PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = CL_DEVICE_PARTITION_MAX_SUB_DEVICES, PI_DEVICE_INFO_PARTITION_PROPERTIES = CL_DEVICE_PARTITION_PROPERTIES, - PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = CL_DEVICE_PARTITION_AFFINITY_DOMAIN, + PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = + CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = + CL_DEVICE_PARTITION_AFFINITY_DOMAIN, PI_DEVICE_INFO_PARTITION_TYPE = CL_DEVICE_PARTITION_TYPE, + PI_DEVICE_INFO_USM_HOST_SUPPORT = + CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, + PI_DEVICE_INFO_USM_DEVICE_SUPPORT = + CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL, + PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = + CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, + PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = + CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, + PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = + CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL } _pi_device_info; typedef enum { - PI_LOCAL_MEM_TYPE_LOCAL = CL_LOCAL, - PI_LOCAL_MEM_TYPE_GLOBAL = CL_GLOBAL -} _pi_local_mem_type; + PI_PROGRAM_INFO_REFERENCE_COUNT = CL_PROGRAM_REFERENCE_COUNT, + PI_PROGRAM_INFO_CONTEXT = CL_PROGRAM_CONTEXT, + PI_PROGRAM_INFO_NUM_DEVICES = CL_PROGRAM_NUM_DEVICES, + PI_PROGRAM_INFO_DEVICES = CL_PROGRAM_DEVICES, + PI_PROGRAM_INFO_SOURCE = CL_PROGRAM_SOURCE, + PI_PROGRAM_INFO_BINARY_SIZES = CL_PROGRAM_BINARY_SIZES, + PI_PROGRAM_INFO_BINARIES = CL_PROGRAM_BINARIES, + PI_PROGRAM_INFO_NUM_KERNELS = CL_PROGRAM_NUM_KERNELS, + PI_PROGRAM_INFO_KERNEL_NAMES = CL_PROGRAM_KERNEL_NAMES +} _pi_program_info; + +typedef intptr_t pi_context_properties; -// TODO: populate typedef enum { PI_CONTEXT_INFO_DEVICES = CL_CONTEXT_DEVICES, - PI_CONTEXT_INFO_NUM_DEVICES = CL_CONTEXT_NUM_DEVICES, - PI_CONTEXT_INFO_REFERENCE_COUNT = CL_CONTEXT_REFERENCE_COUNT + PI_CONTEXT_INFO_NUM_DEVICES = CL_CONTEXT_NUM_DEVICES, + PI_CONTEXT_INFO_PROPERTIES = CL_CONTEXT_PROPERTIES, + PI_CONTEXT_INFO_REFERENCE_COUNT = CL_CONTEXT_REFERENCE_COUNT, } _pi_context_info; -// TODO: populate typedef enum { + PI_QUEUE_INFO_CONTEXT = CL_QUEUE_CONTEXT, PI_QUEUE_INFO_DEVICE = CL_QUEUE_DEVICE, - PI_QUEUE_INFO_REFERENCE_COUNT = CL_QUEUE_REFERENCE_COUNT, + PI_QUEUE_INFO_DEVICE_DEFAULT = CL_QUEUE_DEVICE_DEFAULT, PI_QUEUE_INFO_PROPERTIES = CL_QUEUE_PROPERTIES, - PI_QUEUE_INFO_CONTEXT = CL_QUEUE_CONTEXT + PI_QUEUE_INFO_REFERENCE_COUNT = CL_QUEUE_REFERENCE_COUNT, + PI_QUEUE_INFO_SIZE = CL_QUEUE_SIZE } _pi_queue_info; -typedef enum { - PI_KERNEL_INFO_FUNCTION_NAME = CL_KERNEL_FUNCTION_NAME, - PI_KERNEL_INFO_NUM_ARGS = CL_KERNEL_NUM_ARGS, - PI_KERNEL_INFO_REFERENCE_COUNT = CL_KERNEL_REFERENCE_COUNT, - PI_KERNEL_INFO_CONTEXT = CL_KERNEL_CONTEXT, - PI_KERNEL_INFO_PROGRAM = CL_KERNEL_PROGRAM -} _pi_kernel_info; - -typedef enum { - PI_KERNEL_GROUP_INFO_SIZE = CL_KERNEL_WORK_GROUP_SIZE, - PI_KERNEL_COMPILE_GROUP_INFO_SIZE = CL_KERNEL_COMPILE_WORK_GROUP_SIZE, - PI_KERNEL_LOCAL_MEM_SIZE = CL_KERNEL_LOCAL_MEM_SIZE -} _pi_kernel_group_info; - typedef enum { PI_IMAGE_INFO_FORMAT = CL_IMAGE_FORMAT, PI_IMAGE_INFO_ELEMENT_SIZE = CL_IMAGE_ELEMENT_SIZE, @@ -255,6 +279,64 @@ typedef enum { PI_IMAGE_INFO_DEPTH = CL_IMAGE_DEPTH } _pi_image_info; +typedef enum { + PI_KERNEL_INFO_FUNCTION_NAME = CL_KERNEL_FUNCTION_NAME, + PI_KERNEL_INFO_NUM_ARGS = CL_KERNEL_NUM_ARGS, + PI_KERNEL_INFO_REFERENCE_COUNT = CL_KERNEL_REFERENCE_COUNT, + PI_KERNEL_INFO_CONTEXT = CL_KERNEL_CONTEXT, + PI_KERNEL_INFO_PROGRAM = CL_KERNEL_PROGRAM, + PI_KERNEL_INFO_ATTRIBUTES = CL_KERNEL_ATTRIBUTES +} _pi_kernel_info; + +typedef enum { + PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE = CL_KERNEL_GLOBAL_WORK_SIZE, + PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE = CL_KERNEL_WORK_GROUP_SIZE, + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE = CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE = CL_KERNEL_LOCAL_MEM_SIZE, + PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = CL_KERNEL_PRIVATE_MEM_SIZE +} _pi_kernel_group_info; + +typedef enum { + PI_EVENT_INFO_COMMAND_QUEUE = CL_EVENT_COMMAND_QUEUE, + PI_EVENT_INFO_CONTEXT = CL_EVENT_CONTEXT, + PI_EVENT_INFO_COMMAND_TYPE = CL_EVENT_COMMAND_TYPE, + PI_EVENT_INFO_COMMAND_EXECUTION_STATUS = CL_EVENT_COMMAND_EXECUTION_STATUS, + PI_EVENT_INFO_REFERENCE_COUNT = CL_EVENT_REFERENCE_COUNT +} _pi_event_info; + +typedef enum { + PI_COMMAND_TYPE_NDRANGE_KERNEL = CL_COMMAND_NDRANGE_KERNEL, + PI_COMMAND_TYPE_MEM_BUFFER_READ = CL_COMMAND_READ_BUFFER, + PI_COMMAND_TYPE_MEM_BUFFER_WRITE = CL_COMMAND_WRITE_BUFFER, + PI_COMMAND_TYPE_MEM_BUFFER_COPY = CL_COMMAND_COPY_BUFFER, + PI_COMMAND_TYPE_MEM_BUFFER_MAP = CL_COMMAND_MAP_BUFFER, + PI_COMMAND_TYPE_MEM_BUFFER_UNMAP = CL_COMMAND_UNMAP_MEM_OBJECT, + PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT = CL_COMMAND_READ_BUFFER_RECT, + PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT = CL_COMMAND_WRITE_BUFFER_RECT, + PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT = CL_COMMAND_COPY_BUFFER_RECT, + PI_COMMAND_TYPE_USER = CL_COMMAND_USER, + PI_COMMAND_TYPE_MEM_BUFFER_FILL = CL_COMMAND_FILL_BUFFER, + PI_COMMAND_TYPE_IMAGE_READ = CL_COMMAND_READ_IMAGE, + PI_COMMAND_TYPE_IMAGE_WRITE = CL_COMMAND_WRITE_IMAGE, + PI_COMMAND_TYPE_IMAGE_COPY = CL_COMMAND_COPY_IMAGE, + PI_COMMAND_TYPE_NATIVE_KERNEL = CL_COMMAND_NATIVE_KERNEL, + PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE = CL_COMMAND_COPY_BUFFER_TO_IMAGE, + PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER = CL_COMMAND_COPY_IMAGE_TO_BUFFER, + PI_COMMAND_TYPE_MAP_IMAGE = CL_COMMAND_MAP_IMAGE, + PI_COMMAND_TYPE_MARKER = CL_COMMAND_MARKER, + PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS = CL_COMMAND_ACQUIRE_GL_OBJECTS, + PI_COMMAND_TYPE_RELEASE_GL_OBJECTS = CL_COMMAND_RELEASE_GL_OBJECTS, + PI_COMMAND_TYPE_BARRIER = CL_COMMAND_BARRIER, + PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS = CL_COMMAND_MIGRATE_MEM_OBJECTS, + PI_COMMAND_TYPE_FILL_IMAGE = CL_COMMAND_FILL_IMAGE, + PI_COMMAND_TYPE_SVM_FREE = CL_COMMAND_SVM_FREE, + PI_COMMAND_TYPE_SVM_MEMCPY = CL_COMMAND_SVM_MEMCPY, + PI_COMMAND_TYPE_SVM_MEMFILL = CL_COMMAND_SVM_MEMFILL, + PI_COMMAND_TYPE_SVM_MAP = CL_COMMAND_SVM_MAP, + PI_COMMAND_TYPE_SVM_UNMAP = CL_COMMAND_SVM_UNMAP +} _pi_command_type; + typedef enum { PI_MEM_TYPE_BUFFER = CL_MEM_OBJECT_BUFFER, PI_MEM_TYPE_IMAGE2D = CL_MEM_OBJECT_IMAGE2D, @@ -265,6 +347,19 @@ typedef enum { PI_MEM_TYPE_IMAGE1D_BUFFER = CL_MEM_OBJECT_IMAGE1D_BUFFER } _pi_mem_type; +typedef enum { + PI_MEM_ADVICE_SET_READ_MOSTLY = 0, // hints that memory will be read from frequently and written to rarely + PI_MEM_ADVICE_CLEAR_READ_MOSTLY, // removes the affect of PI_MEM_ADVICE_SET_READ_MOSTLY + PI_MEM_ADVICE_SET_PREFERRED_LOCATION, // hints that the preferred memory location is the specified device + PI_MEM_ADVICE_CLEAR_PREFERRED_LOCATION, // removes the affect of PI_MEM_ADVICE_SET_PREFERRED_LOCATION + PI_MEM_ADVICE_SET_ACCESSED_BY, // hints that memory will be accessed by the specified device + PI_MEM_ADVICE_CLEAR_ACCESSED_BY, // removes the affect of PI_MEM_ADVICE_SET_ACCESSED_BY + PI_MEM_ADVICE_SET_NON_ATOMIC_MOSTLY, // hints that memory will mostly be accessed non-atomically + PI_MEM_ADVICE_CLEAR_NON_ATOMIC_MOSTLY, // removes the affect of PI_MEM_ADVICE_SET_NON_ATOMIC_MOSTLY + PI_MEM_ADVICE_BIAS_CACHED, // hints that memory should be cached + PI_MEM_ADVICE_BIAS_UNCACHED // hints that memory should not be cached +} _pi_mem_advice; + typedef enum { PI_IMAGE_CHANNEL_ORDER_A = CL_A, PI_IMAGE_CHANNEL_ORDER_R = CL_R, @@ -304,19 +399,18 @@ typedef enum { PI_BUFFER_CREATE_TYPE_REGION = CL_BUFFER_CREATE_TYPE_REGION } _pi_buffer_create_type; -typedef pi_bitfield pi_sampler_properties; const pi_bool PI_TRUE = CL_TRUE; const pi_bool PI_FALSE = CL_FALSE; typedef enum { - PI_SAMPLER_INFO_REFERENCE_COUNT = CL_SAMPLER_REFERENCE_COUNT, - PI_SAMPLER_INFO_CONTEXT = CL_SAMPLER_CONTEXT, - PI_SAMPLER_INFO_NORMALIZED_COORDS = CL_SAMPLER_NORMALIZED_COORDS, - PI_SAMPLER_INFO_ADDRESSING_MODE = CL_SAMPLER_ADDRESSING_MODE, - PI_SAMPLER_INFO_FILTER_MODE = CL_SAMPLER_FILTER_MODE, - PI_SAMPLER_INFO_MIP_FILTER_MODE = CL_SAMPLER_MIP_FILTER_MODE, - PI_SAMPLER_INFO_LOD_MIN = CL_SAMPLER_LOD_MIN, - PI_SAMPLER_INFO_LOD_MAX = CL_SAMPLER_LOD_MAX + PI_SAMPLER_INFO_REFERENCE_COUNT = CL_SAMPLER_REFERENCE_COUNT, + PI_SAMPLER_INFO_CONTEXT = CL_SAMPLER_CONTEXT, + PI_SAMPLER_INFO_NORMALIZED_COORDS = CL_SAMPLER_NORMALIZED_COORDS, + PI_SAMPLER_INFO_ADDRESSING_MODE = CL_SAMPLER_ADDRESSING_MODE, + PI_SAMPLER_INFO_FILTER_MODE = CL_SAMPLER_FILTER_MODE, + PI_SAMPLER_INFO_MIP_FILTER_MODE = CL_SAMPLER_MIP_FILTER_MODE, + PI_SAMPLER_INFO_LOD_MIN = CL_SAMPLER_LOD_MIN, + PI_SAMPLER_INFO_LOD_MAX = CL_SAMPLER_LOD_MAX } _pi_sampler_info; typedef enum { @@ -332,14 +426,14 @@ typedef enum { PI_SAMPLER_FILTER_MODE_LINEAR = CL_FILTER_LINEAR, } _pi_sampler_filter_mode; -typedef enum { - PI_EVENT_INFO_QUEUE = CL_EVENT_COMMAND_QUEUE, - PI_EVENT_INFO_COMMAND_TYPE = CL_EVENT_COMMAND_TYPE, - PI_EVENT_INFO_REFERENCE_COUNT = CL_EVENT_REFERENCE_COUNT, - PI_EVENT_INFO_COMMAND_EXECUTION_STATUS = CL_EVENT_COMMAND_EXECUTION_STATUS, - PI_EVENT_INFO_CONTEXT = CL_EVENT_CONTEXT -} _pi_event_info; +using pi_device_exec_capabilities pi_bitfield; +constexpr pi_device_exec_capabilities PI_DEVICE_EXEC_CAPABILITIES_KERNEL = CL_EXEC_KERNEL; +constexpr pi_device_exec_capabilities PI_DEVICE_EXEC_CAPABILITIES_NATIVE_KERNEL = CL_EXEC_NATIVE_KERNEL; +using pi_sampler_properties pi_bitfield; +constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS = CL_SAMPLER_NORMALIZED_COORDS; +constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE = CL_SAMPLER_ADDRESSING_MODE; +constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_FILTER_MODE = CL_SAMPLER_FILTER_MODE; // NOTE: this is made 64-bit to match the size of cl_mem_flags to // make the translation to OpenCL transparent. @@ -347,29 +441,37 @@ typedef enum { // typedef pi_bitfield pi_mem_flags; // Access -const pi_mem_flags PI_MEM_FLAGS_ACCESS_RW = CL_MEM_READ_WRITE; +const pi_mem_flags PI_MEM_FLAGS_ACCESS_RW = CL_MEM_READ_WRITE; // Host pointer -const pi_mem_flags PI_MEM_FLAGS_HOST_PTR_USE = CL_MEM_USE_HOST_PTR; +const pi_mem_flags PI_MEM_FLAGS_HOST_PTR_USE = CL_MEM_USE_HOST_PTR; const pi_mem_flags PI_MEM_FLAGS_HOST_PTR_COPY = CL_MEM_COPY_HOST_PTR; // NOTE: queue properties are implemented this way to better support bit // manipulations typedef pi_bitfield pi_queue_properties; const pi_queue_properties PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; const pi_queue_properties PI_QUEUE_PROFILING_ENABLE = CL_QUEUE_PROFILING_ENABLE; const pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE; const pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = - CL_QUEUE_ON_DEVICE_DEFAULT; + CL_QUEUE_ON_DEVICE_DEFAULT; typedef _pi_result pi_result; typedef _pi_platform_info pi_platform_info; typedef _pi_device_type pi_device_type; +typedef _pi_device_mem_cache_type pi_device_mem_cache_type; +typedef _pi_device_local_mem_type pi_device_local_mem_type; typedef _pi_device_info pi_device_info; +typedef _pi_program_info pi_program_info; typedef _pi_context_info pi_context_info; typedef _pi_queue_info pi_queue_info; typedef _pi_image_info pi_image_info; +typedef _pi_kernel_info pi_kernel_info; +typedef _pi_kernel_group_info pi_kernel_group_info; +typedef _pi_event_info pi_event_info; +typedef _pi_command_type pi_command_type; typedef _pi_mem_type pi_mem_type; +typedef _pi_mem_advice pi_mem_advice; typedef _pi_image_channel_order pi_image_channel_order; typedef _pi_image_channel_type pi_image_channel_type; typedef _pi_buffer_create_type pi_buffer_create_type; @@ -377,13 +479,23 @@ typedef _pi_sampler_addressing_mode pi_sampler_addressing_mode; typedef _pi_sampler_filter_mode pi_sampler_filter_mode; typedef _pi_sampler_info pi_sampler_info; typedef _pi_event_status pi_event_status; -typedef _pi_event_info pi_event_info; -typedef _pi_command_type pi_command_type; -typedef _pi_program_info pi_program_info; typedef _pi_program_build_info pi_program_build_info; typedef _pi_program_build_status pi_program_build_status; typedef _pi_kernel_info pi_kernel_info; -typedef _pi_kernel_group_info pi_kernel_group_info; + +// For compatibility with OpenCL define this not as enum. +using pi_device_partition_property intptr_t; +static constexpr pi_device_partition_property + PI_DEVICE_PARTITION_EQUALLY = CL_DEVICE_PARTITION_EQUALLY; +static constexpr pi_device_partition_property + PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; + +// For compatibility with OpenCL define this not as enum. +using pi_device_affinity_domain pi_bitfield; +static constexpr pi_device_affinity_domain + PI_DEVICE_AFFINITY_DOMAIN_NUMA = CL_DEVICE_AFFINITY_DOMAIN_NUMA; +static constexpr pi_device_affinity_domain + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE = CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE; // Entry type, matches OpenMP for compatibility struct _pi_offload_entry_struct { @@ -394,17 +506,17 @@ struct _pi_offload_entry_struct { int32_t reserved; }; -typedef _pi_offload_entry_struct * _pi_offload_entry; +typedef _pi_offload_entry_struct *_pi_offload_entry; /// Types of device binary. typedef uint8_t pi_device_binary_type; // format is not determined -static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_NONE = 0; +static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_NONE = 0; // specific to a device -static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_NATIVE = 1; +static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_NATIVE = 1; // portable binary types go next // SPIR-V -static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_SPIRV = 2; +static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_SPIRV = 2; // LLVM bitcode static const pi_device_binary_type PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE = 3; @@ -433,10 +545,10 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; #define PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA "spir64_fpga" /// This struct is a record of the device binary information. If the Kind field -/// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec field -/// can still be specific and denote e.g. FPGA target. -/// It must match the __tgt_device_image structure generated by -/// the clang-offload-wrapper tool when their Version field match. +/// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec +/// field can still be specific and denote e.g. FPGA target. It must match the +/// __tgt_device_image structure generated by the clang-offload-wrapper tool +/// when their Version field match. struct pi_device_binary_struct { /// version of this structure - for backward compatibility; /// all modifications which change order/type/offsets of existing fields @@ -473,7 +585,7 @@ struct pi_device_binary_struct { _pi_offload_entry EntriesBegin; _pi_offload_entry EntriesEnd; }; -typedef pi_device_binary_struct * pi_device_binary; +typedef pi_device_binary_struct *pi_device_binary; // Offload binaries descriptor version supported by this library. static const uint16_t PI_DEVICE_BINARIES_VERSION = 1; @@ -494,7 +606,7 @@ struct pi_device_binaries_struct { _pi_offload_entry *HostEntriesBegin; _pi_offload_entry *HostEntriesEnd; }; -typedef pi_device_binaries_struct * pi_device_binaries; +typedef pi_device_binaries_struct *pi_device_binaries; // Opaque types that make reading build log errors easier. struct _pi_platform; @@ -507,15 +619,15 @@ struct _pi_kernel; struct _pi_event; struct _pi_sampler; -typedef _pi_platform * pi_platform; -typedef _pi_device * pi_device; -typedef _pi_context * pi_context; -typedef _pi_queue * pi_queue; -typedef _pi_mem * pi_mem; -typedef _pi_program * pi_program; -typedef _pi_kernel * pi_kernel; -typedef _pi_event * pi_event; -typedef _pi_sampler * pi_sampler; +typedef _pi_platform *pi_platform; +typedef _pi_device *pi_device; +typedef _pi_context *pi_context; +typedef _pi_queue *pi_queue; +typedef _pi_mem *pi_mem; +typedef _pi_program *pi_program; +typedef _pi_kernel *pi_kernel; +typedef _pi_event *pi_event; +typedef _pi_sampler *pi_sampler; typedef struct { pi_image_channel_order image_channel_order; @@ -535,8 +647,8 @@ typedef struct { pi_mem buffer; } _pi_image_desc; -typedef _pi_image_format pi_image_format; -typedef _pi_image_desc pi_image_desc; +typedef _pi_image_format pi_image_format; +typedef _pi_image_desc pi_image_desc; // // Following section contains SYCL RT Plugin Interface (PI) functions. // They are 3 distinct categories: @@ -563,54 +675,39 @@ pi_result piPluginInit(pi_plugin *plugin_info); // // Platform // -pi_result piPlatformsGet( - pi_uint32 num_entries, - pi_platform * platforms, - pi_uint32 * num_platforms); - -pi_result piPlatformGetInfo( - pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, + pi_uint32 *num_platforms); + +pi_result piPlatformGetInfo(pi_platform platform, pi_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); // // Device // -pi_result piDevicesGet( - pi_platform platform, - pi_device_type device_type, - pi_uint32 num_entries, - pi_device * devices, - pi_uint32 * num_devices); - -pi_result piDeviceGetInfo( - pi_device device, - pi_device_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, + pi_uint32 num_entries, pi_device *devices, + pi_uint32 *num_devices); + +pi_result piDeviceGetInfo(pi_device device, pi_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); pi_result piDeviceRetain(pi_device device); pi_result piDeviceRelease(pi_device device); pi_result piDevicePartition( - pi_device device, - const cl_device_partition_property * properties, // TODO: untie from OpenCL - pi_uint32 num_devices, - pi_device * out_devices, - pi_uint32 * out_num_devices); + pi_device device, + const pi_device_partition_property *properties, + pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices); /// Selects the most appropriate device binary based on runtime information /// and the IR characteristics. /// -pi_result piextDeviceSelectBinary( - pi_device device, - pi_device_binary * binaries, - pi_uint32 num_binaries, - pi_device_binary * selected_binary); +pi_result piextDeviceSelectBinary(pi_device device, pi_device_binary *binaries, + pi_uint32 num_binaries, + pi_device_binary *selected_binary); /// Retrieves a device function pointer to a user-defined function /// \arg \c function_name. \arg \c function_pointer_ret is set to 0 if query @@ -620,33 +717,23 @@ pi_result piextDeviceSelectBinary( /// must present in the list of devices returned by \c get_device method for /// \arg \c program. /// -pi_result piextGetDeviceFunctionPointer( - pi_device device, - pi_program program, - const char * function_name, - pi_uint64 * function_pointer_ret); +pi_result piextGetDeviceFunctionPointer(pi_device device, pi_program program, + const char *function_name, + pi_uint64 *function_pointer_ret); // // Context // -pi_result piContextCreate( - const cl_context_properties * properties, // TODO: untie from OpenCL - pi_uint32 num_devices, - const pi_device * devices, - void (* pfn_notify)( - const char * errinfo, - const void * private_info, - size_t cb, - void * user_data), - void * user_data, - pi_context * ret_context); - -pi_result piContextGetInfo( - pi_context context, - pi_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +pi_result piContextCreate(const pi_context_properties *properties, + pi_uint32 num_devices, const pi_device *devices, + void (*pfn_notify)(const char *errinfo, + const void *private_info, + size_t cb, void *user_data), + void *user_data, pi_context *ret_context); + +pi_result piContextGetInfo(pi_context context, pi_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); pi_result piContextRetain(pi_context context); @@ -655,18 +742,12 @@ pi_result piContextRelease(pi_context context); // // Queue // -pi_result piQueueCreate( - pi_context context, - pi_device device, - pi_queue_properties properties, - pi_queue * queue); - -pi_result piQueueGetInfo( - pi_queue command_queue, - pi_queue_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +pi_result piQueueCreate(pi_context context, pi_device device, + pi_queue_properties properties, pi_queue *queue); + +pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); pi_result piQueueRetain(pi_queue command_queue); @@ -677,118 +758,75 @@ pi_result piQueueFinish(pi_queue command_queue); // // Memory // -pi_result piMemBufferCreate( - pi_context context, - pi_mem_flags flags, - size_t size, - void * host_ptr, - pi_mem * ret_mem); - -pi_result piMemImageCreate( - pi_context context, - pi_mem_flags flags, - const pi_image_format * image_format, - const pi_image_desc * image_desc, - void * host_ptr, - pi_mem * ret_mem); - -pi_result piMemGetInfo( - pi_mem mem, - cl_mem_info param_name, // TODO: untie from OpenCL - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -pi_result piMemImageGetInfo ( - pi_mem image, - pi_image_info param_name, - size_t param_value_size, - void * param_value , - size_t * param_value_size_ret); - -pi_result piMemRetain( - pi_mem mem); - -pi_result piMemRelease( - pi_mem mem); - - -pi_result piMemBufferPartition( - pi_mem buffer, - pi_mem_flags flags, - pi_buffer_create_type buffer_create_type, - void * buffer_create_info, - pi_mem * ret_mem); +pi_result piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, + void *host_ptr, pi_mem *ret_mem); + +pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, + const pi_image_format *image_format, + const pi_image_desc *image_desc, void *host_ptr, + pi_mem *ret_mem); + +pi_result piMemGetInfo(pi_mem mem, + cl_mem_info param_name, // TODO: untie from OpenCL + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + +pi_result piMemImageGetInfo(pi_mem image, pi_image_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + +pi_result piMemRetain(pi_mem mem); + +pi_result piMemRelease(pi_mem mem); + +pi_result piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, + pi_buffer_create_type buffer_create_type, + void *buffer_create_info, pi_mem *ret_mem); // // Program // -pi_result piProgramCreate( - pi_context context, - const void * il, - size_t length, - pi_program * res_program); - -pi_result piclProgramCreateWithSource( - pi_context context, - pi_uint32 count, - const char ** strings, - const size_t * lengths, - pi_program * ret_program); - -pi_result piclProgramCreateWithBinary( - pi_context context, - pi_uint32 num_devices, - const pi_device * device_list, - const size_t * lengths, - const unsigned char ** binaries, - pi_int32 * binary_status, - pi_program * ret_program); - -pi_result piProgramGetInfo( - pi_program program, - pi_program_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -pi_result piProgramLink( - pi_context context, - pi_uint32 num_devices, - const pi_device * device_list, - const char * options, - pi_uint32 num_input_programs, - const pi_program * input_programs, - void (* pfn_notify)(pi_program program, - void * user_data), - void * user_data, - pi_program * ret_program); +pi_result piProgramCreate(pi_context context, const void *il, size_t length, + pi_program *res_program); + +pi_result piclProgramCreateWithSource(pi_context context, pi_uint32 count, + const char **strings, + const size_t *lengths, + pi_program *ret_program); + +pi_result piclProgramCreateWithBinary(pi_context context, pi_uint32 num_devices, + const pi_device *device_list, + const size_t *lengths, + const unsigned char **binaries, + pi_int32 *binary_status, + pi_program *ret_program); + +pi_result piProgramGetInfo(pi_program program, pi_program_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + +pi_result piProgramLink(pi_context context, pi_uint32 num_devices, + const pi_device *device_list, const char *options, + pi_uint32 num_input_programs, + const pi_program *input_programs, + void (*pfn_notify)(pi_program program, void *user_data), + void *user_data, pi_program *ret_program); pi_result piProgramCompile( - pi_program program, - pi_uint32 num_devices, - const pi_device * device_list, - const char * options, - pi_uint32 num_input_headers, - const pi_program * input_headers, - const char ** header_include_names, - void (* pfn_notify)(pi_program program, void * user_data), - void * user_data); - -pi_result piProgramBuild( - pi_program program, - pi_uint32 num_devices, - const pi_device * device_list, - const char * options, - void (* pfn_notify)(pi_program program, void * user_data), - void * user_data); + pi_program program, pi_uint32 num_devices, const pi_device *device_list, + const char *options, pi_uint32 num_input_headers, + const pi_program *input_headers, const char **header_include_names, + void (*pfn_notify)(pi_program program, void *user_data), void *user_data); + +pi_result piProgramBuild(pi_program program, pi_uint32 num_devices, + const pi_device *device_list, const char *options, + void (*pfn_notify)(pi_program program, + void *user_data), + void *user_data); pi_result piProgramGetBuildInfo( - pi_program program, - pi_device device, - cl_program_build_info param_name, // TODO: untie from OpenCL - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); + pi_program program, pi_device device, + cl_program_build_info param_name, // TODO: untie from OpenCL + size_t param_value_size, void *param_value, size_t *param_value_size_ret); pi_result piProgramRetain(pi_program program); @@ -802,50 +840,35 @@ typedef enum { /// indicates that the kernel might access data through USM ptrs PI_USM_INDIRECT_ACCESS, /// provides an explicit list of pointers that the kernel will access - PI_USM_PTRS = CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL + PI_USM_PTRS = CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL } _pi_kernel_exec_info; -typedef _pi_kernel_exec_info pi_kernel_exec_info; - -pi_result piKernelCreate( - pi_program program, - const char * kernel_name, - pi_kernel * ret_kernel); - -pi_result piKernelSetArg( - pi_kernel kernel, - pi_uint32 arg_index, - size_t arg_size, - const void * arg_value); - -pi_result piKernelGetInfo( - pi_kernel kernel, - pi_kernel_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -pi_result piKernelGetGroupInfo( - pi_kernel kernel, - pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +typedef _pi_kernel_exec_info pi_kernel_exec_info; + +pi_result piKernelCreate(pi_program program, const char *kernel_name, + pi_kernel *ret_kernel); + +pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, + const void *arg_value); + +pi_result piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + +pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, + pi_kernel_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); pi_result piKernelGetSubGroupInfo( - pi_kernel kernel, - pi_device device, - cl_kernel_sub_group_info param_name, // TODO: untie from OpenCL - size_t input_value_size, - const void* input_value, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); + pi_kernel kernel, pi_device device, + cl_kernel_sub_group_info param_name, // TODO: untie from OpenCL + size_t input_value_size, const void *input_value, size_t param_value_size, + void *param_value, size_t *param_value_size_ret); -pi_result piKernelRetain(pi_kernel kernel); +pi_result piKernelRetain(pi_kernel kernel); -pi_result piKernelRelease(pi_kernel kernel); +pi_result piKernelRelease(pi_kernel kernel); /// Sets up pointer arguments for CL kernels. An extra indirection /// is required due to CL argument conventions. @@ -854,11 +877,8 @@ pi_result piKernelRelease(pi_kernel kernel); /// @param arg_index is the index of the kernel argument /// @param arg_size is the size in bytes of the argument (ignored in CL) /// @param arg_value is the pointer argument -pi_result piextKernelSetArgPointer( - pi_kernel kernel, - pi_uint32 arg_index, - size_t arg_size, - const void * arg_value); +pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, + size_t arg_size, const void *arg_value); /// API to set attributes controlling kernel execution /// @@ -877,39 +897,29 @@ pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, // // Events // -pi_result piEventCreate( - pi_context context, - pi_event * ret_event); - -pi_result piEventGetInfo( - pi_event event, - cl_event_info param_name, // TODO: untie from OpenCL - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -pi_result piEventGetProfilingInfo( - pi_event event, - cl_profiling_info param_name, // TODO: untie from OpenCL - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -pi_result piEventsWait( - pi_uint32 num_events, - const pi_event * event_list); - -pi_result piEventSetCallback( - pi_event event, - pi_int32 command_exec_callback_type, - void (* pfn_notify)(pi_event event, - pi_int32 event_command_status, - void * user_data), - void * user_data); - -pi_result piEventSetStatus( - pi_event event, - pi_int32 execution_status); +pi_result piEventCreate(pi_context context, pi_event *ret_event); + +pi_result piEventGetInfo(pi_event event, + cl_event_info param_name, // TODO: untie from OpenCL + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + +pi_result +piEventGetProfilingInfo(pi_event event, + cl_profiling_info param_name, // TODO: untie from OpenCL + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + +pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list); + +pi_result piEventSetCallback(pi_event event, + pi_int32 command_exec_callback_type, + void (*pfn_notify)(pi_event event, + pi_int32 event_command_status, + void *user_data), + void *user_data); + +pi_result piEventSetStatus(pi_event event, pi_int32 execution_status); pi_result piEventRetain(pi_event event); @@ -918,17 +928,13 @@ pi_result piEventRelease(pi_event event); // // Sampler // -pi_result piSamplerCreate( - pi_context context, - const pi_sampler_properties * sampler_properties, - pi_sampler * result_sampler); - -pi_result piSamplerGetInfo( - pi_sampler sampler, - pi_sampler_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +pi_result piSamplerCreate(pi_context context, + const pi_sampler_properties *sampler_properties, + pi_sampler *result_sampler); + +pi_result piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); pi_result piSamplerRetain(pi_sampler sampler); @@ -938,232 +944,156 @@ pi_result piSamplerRelease(pi_sampler sampler); // Queue Commands // pi_result piEnqueueKernelLaunch( - pi_queue queue, - pi_kernel kernel, - pi_uint32 work_dim, - const size_t * global_work_offset, - const size_t * global_work_size, - const size_t * local_work_size, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueNativeKernel( - pi_queue queue, - void (*user_func)(void *), - void * args, - size_t cb_args, - pi_uint32 num_mem_objects, - const pi_mem * mem_list, - const void ** args_mem_loc, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueEventsWait( - pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemBufferRead( - pi_queue queue, - pi_mem buffer, - pi_bool blocking_read, - size_t offset, - size_t size, - void * ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemBufferReadRect( - pi_queue command_queue, - pi_mem buffer, - pi_bool blocking_read, - const size_t * buffer_offset, - const size_t * host_offset, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - void * ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemBufferWrite( - pi_queue command_queue, - pi_mem buffer, - pi_bool blocking_write, - size_t offset, - size_t size, - const void * ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemBufferWriteRect( - pi_queue command_queue, - pi_mem buffer, - pi_bool blocking_write, - const size_t * buffer_offset, - const size_t * host_offset, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - const void * ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemBufferCopy( - pi_queue command_queue, - pi_mem src_buffer, - pi_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); + pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result +piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, + size_t cb_args, pi_uint32 num_mem_objects, + const pi_mem *mem_list, const void **args_mem_loc, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result piEnqueueEventsWait(pi_queue command_queue, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result piEnqueueMemBufferRead(pi_queue queue, pi_mem buffer, + pi_bool blocking_read, size_t offset, + size_t size, void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + +pi_result +piEnqueueMemBufferReadRect(pi_queue command_queue, pi_mem buffer, + pi_bool blocking_read, const size_t *buffer_offset, + const size_t *host_offset, const size_t *region, + size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, + pi_bool blocking_write, size_t offset, + size_t size, const void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + +pi_result +piEnqueueMemBufferWriteRect(pi_queue command_queue, pi_mem buffer, + pi_bool blocking_write, const size_t *buffer_offset, + const size_t *host_offset, const size_t *region, + size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, + pi_mem dst_buffer, size_t src_offset, + size_t dst_offset, size_t size, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); pi_result piEnqueueMemBufferCopyRect( - pi_queue command_queue, - pi_mem src_buffer, - pi_mem dst_buffer, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - size_t src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemBufferFill( - pi_queue command_queue, - pi_mem buffer, - const void * pattern, - size_t pattern_size, - size_t offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemImageRead( - pi_queue command_queue, - pi_mem image, - pi_bool blocking_read, - const size_t * origin, - const size_t * region, - size_t row_pitch, - size_t slice_pitch, - void * ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemImageWrite( - pi_queue command_queue, - pi_mem image, - pi_bool blocking_write, - const size_t * origin, - const size_t * region, - size_t input_row_pitch, - size_t input_slice_pitch, - const void * ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemImageCopy( - pi_queue command_queue, - pi_mem src_image, - pi_mem dst_image, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piEnqueueMemImageFill( - pi_queue command_queue, - pi_mem image, - const void * fill_color, - const size_t * origin, - const size_t * region, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); + pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, + const void *pattern, size_t pattern_size, + size_t offset, size_t size, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + +pi_result piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, + pi_bool blocking_read, const size_t *origin, + const size_t *region, size_t row_pitch, + size_t slice_pitch, void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + +pi_result piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, + pi_bool blocking_write, const size_t *origin, + const size_t *region, size_t input_row_pitch, + size_t input_slice_pitch, const void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + +pi_result piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, + pi_mem dst_image, const size_t *src_origin, + const size_t *dst_origin, const size_t *region, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + +pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, + const void *fill_color, const size_t *origin, + const size_t *region, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); pi_result piEnqueueMemBufferMap( - pi_queue command_queue, - pi_mem buffer, - pi_bool blocking_map, - cl_map_flags map_flags, // TODO: untie from OpenCL - size_t offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event, - void* * ret_map); - -pi_result piEnqueueMemUnmap( - pi_queue command_queue, - pi_mem memobj, - void * mapped_ptr, - pi_uint32 num_events_in_wait_list, - const pi_event * event_wait_list, - pi_event * event); - -pi_result piextKernelSetArgMemObj( - pi_kernel kernel, - pi_uint32 arg_index, - const pi_mem *arg_value); + pi_queue command_queue, pi_mem buffer, pi_bool blocking_map, + cl_map_flags map_flags, // TODO: untie from OpenCL + size_t offset, size_t size, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event, void **ret_map); + +pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, + void *mapped_ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + +pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, + const pi_mem *arg_value); /// // USM /// typedef enum { - PI_USM_HOST_SUPPORT = CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, - PI_USM_DEVICE_SUPPORT = CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL, - PI_USM_SINGLE_SHARED_SUPPORT = CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, - PI_USM_CROSS_SHARED_SUPPORT = CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, + PI_USM_HOST_SUPPORT = CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, + PI_USM_DEVICE_SUPPORT = CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL, + PI_USM_SINGLE_SHARED_SUPPORT = + CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, + PI_USM_CROSS_SHARED_SUPPORT = + CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, PI_USM_SYSTEM_SHARED_SUPPORT = CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL } _pi_usm_capability_query; typedef enum : pi_bitfield { - PI_USM_ACCESS = CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL, - PI_USM_ATOMIC_ACCESS = CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL, - PI_USM_CONCURRENT_ACCESS = CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL, - PI_USM_CONCURRENT_ATOMIC_ACCESS = CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL + PI_USM_ACCESS = CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL, + PI_USM_ATOMIC_ACCESS = CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL, + PI_USM_CONCURRENT_ACCESS = CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL, + PI_USM_CONCURRENT_ATOMIC_ACCESS = + CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL } _pi_usm_capabilities; typedef enum { - PI_MEM_ALLOC_TYPE = CL_MEM_ALLOC_TYPE_INTEL, - PI_MEM_ALLOC_BASE_PTR = CL_MEM_ALLOC_BASE_PTR_INTEL, - PI_MEM_ALLOC_SIZE = CL_MEM_ALLOC_SIZE_INTEL, - PI_MEM_ALLOC_DEVICE = CL_MEM_ALLOC_DEVICE_INTEL, - PI_MEM_ALLOC_INFO_TBD0 = CL_MEM_ALLOC_INFO_TBD0_INTEL, - PI_MEM_ALLOC_INFO_TBD1 = CL_MEM_ALLOC_INFO_TBD1_INTEL, + PI_MEM_ALLOC_TYPE = CL_MEM_ALLOC_TYPE_INTEL, + PI_MEM_ALLOC_BASE_PTR = CL_MEM_ALLOC_BASE_PTR_INTEL, + PI_MEM_ALLOC_SIZE = CL_MEM_ALLOC_SIZE_INTEL, + PI_MEM_ALLOC_DEVICE = CL_MEM_ALLOC_DEVICE_INTEL, + PI_MEM_ALLOC_INFO_TBD0 = CL_MEM_ALLOC_INFO_TBD0_INTEL, + PI_MEM_ALLOC_INFO_TBD1 = CL_MEM_ALLOC_INFO_TBD1_INTEL, } _pi_mem_info; typedef enum { PI_MEM_TYPE_UNKNOWN = CL_MEM_TYPE_UNKNOWN_INTEL, - PI_MEM_TYPE_HOST = CL_MEM_TYPE_HOST_INTEL, - PI_MEM_TYPE_DEVICE = CL_MEM_TYPE_DEVICE_INTEL, - PI_MEM_TYPE_SHARED = CL_MEM_TYPE_SHARED_INTEL + PI_MEM_TYPE_HOST = CL_MEM_TYPE_HOST_INTEL, + PI_MEM_TYPE_DEVICE = CL_MEM_TYPE_DEVICE_INTEL, + PI_MEM_TYPE_SHARED = CL_MEM_TYPE_SHARED_INTEL } _pi_usm_type; -typedef enum : pi_bitfield { +typedef enum : pi_bitfield { PI_MEM_ALLOC_FLAGS = CL_MEM_ALLOC_FLAGS_INTEL } _pi_usm_mem_properties; @@ -1171,12 +1101,12 @@ typedef enum : pi_bitfield { PI_USM_MIGRATION_TBD0 = (1 << 0) } _pi_usm_migration_flags; -typedef _pi_usm_capability_query pi_usm_capability_query; -typedef _pi_usm_capabilities pi_usm_capabilities; -typedef _pi_mem_info pi_mem_info; -typedef _pi_usm_type pi_usm_type; -typedef _pi_usm_mem_properties pi_usm_mem_properties; -typedef _pi_usm_migration_flags pi_usm_migration_flags; +typedef _pi_usm_capability_query pi_usm_capability_query; +typedef _pi_usm_capabilities pi_usm_capabilities; +typedef _pi_mem_info pi_mem_info; +typedef _pi_usm_type pi_usm_type; +typedef _pi_usm_mem_properties pi_usm_mem_properties; +typedef _pi_usm_migration_flags pi_usm_migration_flags; /// Allocates host memory accessible by the device. /// @@ -1185,12 +1115,9 @@ typedef _pi_usm_migration_flags pi_usm_migration_flags; /// @param pi_usm_mem_properties are optional allocation properties /// @param size_t is the size of the allocation /// @param alignment is the desired alignment of the allocation -pi_result piextUSMHostAlloc( - void ** result_ptr, - pi_context context, - pi_usm_mem_properties * properties, - size_t size, - pi_uint32 alignment); +pi_result piextUSMHostAlloc(void **result_ptr, pi_context context, + pi_usm_mem_properties *properties, size_t size, + pi_uint32 alignment); /// Allocates device memory /// @@ -1200,13 +1127,10 @@ pi_result piextUSMHostAlloc( /// @param pi_usm_mem_properties are optional allocation properties /// @param size_t is the size of the allocation /// @param alignment is the desired alignment of the allocation -pi_result piextUSMDeviceAlloc( - void ** result_ptr, - pi_context context, - pi_device device, - pi_usm_mem_properties * properties, - size_t size, - pi_uint32 alignment); +pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context, + pi_device device, + pi_usm_mem_properties *properties, size_t size, + pi_uint32 alignment); /// Allocates memory accessible on both host and device /// @@ -1216,40 +1140,32 @@ pi_result piextUSMDeviceAlloc( /// @param pi_usm_mem_properties are optional allocation properties /// @param size_t is the size of the allocation /// @param alignment is the desired alignment of the allocation -pi_result piextUSMSharedAlloc( - void ** result_ptr, - pi_context context, - pi_device device, - pi_usm_mem_properties * properties, - size_t size, - pi_uint32 alignment); +pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, + pi_device device, + pi_usm_mem_properties *properties, size_t size, + pi_uint32 alignment); /// Frees allocated USM memory /// /// @param context is the pi_context of the allocation /// @param ptr is the memory to be freed -pi_result piextUSMFree( - pi_context context, - void * ptr); +pi_result piextUSMFree(pi_context context, void *ptr); /// USM Memset API /// /// @param queue is the queue to submit to /// @param ptr is the ptr to memset -/// @param value is value to set. It is interpreted as an 8-bit value and the upper +/// @param value is value to set. It is interpreted as an 8-bit value and the +/// upper /// 24 bits are ignored /// @param count is the size in bytes to memset /// @param num_events_in_waitlist is the number of events to wait on /// @param events_waitlist is an array of events to wait on /// @param event is the event that represents this operation -pi_result piextUSMEnqueueMemset( - pi_queue queue, - void * ptr, - pi_int32 value, - size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event * events_waitlist, - pi_event * event); +pi_result piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, + size_t count, pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event); /// USM Memcpy API /// @@ -1261,15 +1177,11 @@ pi_result piextUSMEnqueueMemset( /// @param num_events_in_waitlist is the number of events to wait on /// @param events_waitlist is an array of events to wait on /// @param event is the event that represents this operation -pi_result piextUSMEnqueueMemcpy( - pi_queue queue, - pi_bool blocking, - void * dst_ptr, - const void * src_ptr, - size_t size, - pi_uint32 num_events_in_waitlist, - const pi_event * events_waitlist, - pi_event * event); +pi_result piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, + const void *src_ptr, size_t size, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event); /// Hint to migrate memory to the device /// @@ -1280,14 +1192,11 @@ pi_result piextUSMEnqueueMemcpy( /// @param num_events_in_waitlist is the number of events to wait on /// @param events_waitlist is an array of events to wait on /// @param event is the event that represents this operation -pi_result piextUSMEnqueuePrefetch( - pi_queue queue, - const void * ptr, - size_t size, - pi_usm_migration_flags flags, - pi_uint32 num_events_in_waitlist, - const pi_event * events_waitlist, - pi_event * event); +pi_result piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, size_t size, + pi_usm_migration_flags flags, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event); /// USM Memadvise API /// @@ -1297,12 +1206,8 @@ pi_result piextUSMEnqueuePrefetch( /// @param advice is device specific advice /// @param event is the event that represents this operation // USM memadvise API to govern behavior of automatic migration mechanisms -pi_result piextUSMEnqueueMemAdvise( - pi_queue queue, - const void * ptr, - size_t length, - int advice, - pi_event * event); +pi_result piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, + size_t length, int advice, pi_event *event); /// API to query information about USM allocated pointers /// Valid Queries: @@ -1320,13 +1225,10 @@ pi_result piextUSMEnqueueMemAdvise( /// @param param_value_size is the size of the result in bytes /// @param param_value is the result /// @param param_value_ret is how many bytes were written -pi_result piextUSMGetMemAllocInfo( - pi_context context, - const void * ptr, - pi_mem_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); +pi_result piextUSMGetMemAllocInfo(pi_context context, const void *ptr, + pi_mem_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); struct _pi_plugin { // PI version supported by host passed to the plugin. The Plugin diff --git a/sycl/include/CL/sycl/intel/sub_group_host.hpp b/sycl/include/CL/sycl/intel/sub_group_host.hpp index d6fade163b117..16d9514fa4333 100644 --- a/sycl/include/CL/sycl/intel/sub_group_host.hpp +++ b/sycl/include/CL/sycl/intel/sub_group_host.hpp @@ -136,7 +136,7 @@ struct sub_group { } template - void store(multi_ptr dst, T &x) const { + void store(multi_ptr dst, const T &x) const { throw runtime_error("Subgroups are not supported on host device. "); } diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index bb811decac723..489216dfbf68c 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -239,7 +239,7 @@ class queue { /// @param Length is a number of bytes in the allocation. /// @param Advice is a device-defined advice for the specified allocation. /// @return an event representing advice operation. - event mem_advise(const void *Ptr, size_t Length, int Advice); + event mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice); /// Provides hints to the runtime library that data should be made available /// on a device earlier than Unified Shared Memory would normally require it diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index fe42b1d8dc3a1..bc5fcdc6591b2 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -528,43 +528,57 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms) { try { - static constexpr pi_uint32 numPlatforms = 1; + static std::once_flag initFlag; + static pi_uint32 numPlatforms = 1; + static _pi_platform platformId; - if (num_platforms != nullptr) { - *num_platforms = numPlatforms; + if (num_entries == 0 and platforms != nullptr) { + return PI_INVALID_VALUE; + } + if (platforms == nullptr and num_platforms == nullptr) { + return PI_INVALID_VALUE; } pi_result err = PI_SUCCESS; - if (platforms != nullptr) { - - assert(num_entries != 0); - - static std::once_flag initFlag; - static _pi_platform platformId; - std::call_once( - initFlag, - [](pi_result &err) { - err = PI_CHECK_ERROR(cuInit(0)); - - int numDevices = 0; - err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices)); + std::call_once( + initFlag, + [](pi_result &err) { + if (cuInit(0) != CUDA_SUCCESS) { + numPlatforms = 0; + return; + } + int numDevices = 0; + err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices)); + if (numDevices == 0) { + numPlatforms = 0; + return; + } + try { platformId.devices_.reserve(numDevices); - try { - for (int i = 0; i < numDevices; ++i) { - CUdevice device; - err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); - platformId.devices_.emplace_back( - new _pi_device{device, &platformId}); - } - } catch (...) { - // Clear and rethrow to allow retry - platformId.devices_.clear(); - throw; + for (int i = 0; i < numDevices; ++i) { + CUdevice device; + err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); + platformId.devices_.emplace_back( + new _pi_device{device, &platformId}); } - }, - err); + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + platformId.devices_.clear(); + err = PI_OUT_OF_HOST_MEMORY; + } catch (...) { + // Clear and rethrow to allow retry + platformId.devices_.clear(); + throw; + } + }, + err); + if (num_platforms != nullptr) { + *num_platforms = numPlatforms; + } + + if (platforms != nullptr) { *platforms = &platformId; } @@ -1110,12 +1124,30 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, } /* Context APIs */ -pi_result cuda_piContextCreate(const cl_context_properties *properties, - pi_uint32 num_devices, const pi_device *devices, - void (*pfn_notify)(const char *errinfo, - const void *private_info, - size_t cb, void *user_data), - void *user_data, pi_context *retcontext) { + +/// Create a PI CUDA context. +/// +/// By default creates a scoped context and keeps the last active CUDA context +/// on top of the CUDA context stack. +/// With the PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of PI_TRUE +/// creates a primary CUDA context and activates it on the CUDA context stack. +/// +/// @param[in] properties 0 terminated array of key/id-value combinations. Can +/// be nullptr. Only accepts property key/id PI_CONTEXT_PROPERTIES_CUDA_PRIMARY +/// with a pi_bool value. +/// @param[in] num_devices Number of devices to create the context for. +/// @param[in] devices Devices to create the context for. +/// @param[in] pfn_notify Callback, currently unused. +/// @param[in] user_data User data for callback. +/// @param[out] retcontext Set to created context on success. +/// +/// @return PI_SUCCESS on success, otherwise an error return code. +pi_result cuda_piContextCreate(const pi_context_properties *properties, + pi_uint32 num_devices, const pi_device *devices, + void (*pfn_notify)(const char *errinfo, + const void *private_info, + size_t cb, void *user_data), + void *user_data, pi_context *retcontext) { assert(devices != nullptr); // TODO: How to implement context callback? @@ -1127,31 +1159,51 @@ pi_result cuda_piContextCreate(const cl_context_properties *properties, assert(retcontext != nullptr); pi_result errcode_ret = PI_SUCCESS; + // Parse properties. + bool property_cuda_primary = false; + while (properties && (0 != *properties)) { + // Consume property ID. + pi_context_properties id = *properties; + ++properties; + // Consume property value. + pi_context_properties value = *properties; + ++properties; + switch (id) { + case PI_CONTEXT_PROPERTIES_CUDA_PRIMARY: + assert(value == PI_FALSE || value == PI_TRUE); + property_cuda_primary = static_cast(value); + break; + default: + // Unknown property. + assert(!"Unknown piContextCreate property in property list"); + return PI_INVALID_VALUE; + } + } + std::unique_ptr<_pi_context> piContextPtr{nullptr}; try { - if (properties && *properties != PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) { - throw pi_result(CL_INVALID_VALUE); - } else if (!properties) { + if (property_cuda_primary) { + // Use the CUDA primary context and assume that we want to use it + // immediately as we want to forge context switches. + CUcontext Ctxt; + errcode_ret = PI_CHECK_ERROR( + cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->cuDevice_)); + piContextPtr = std::unique_ptr<_pi_context>( + new _pi_context{_pi_context::kind::primary, Ctxt, *devices}); + errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt)); + } else { + // Create a scoped context. CUcontext newContext, current; PI_CHECK_ERROR(cuCtxGetCurrent(¤t)); - errcode_ret = PI_CHECK_ERROR(cuCtxCreate(&newContext, CU_CTX_MAP_HOST, - (*devices)->cuDevice_)); + errcode_ret = PI_CHECK_ERROR( + cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->cuDevice_)); piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{ _pi_context::kind::user_defined, newContext, *devices}); + // For scoped contexts keep the last active CUDA one on top of the stack + // as `cuCtxCreate` replaces it implicitly otherwise. if (current != nullptr) { - // If there was an existing context on the thread we recover it PI_CHECK_ERROR(cuCtxSetCurrent(current)); } - } else if (properties - && *properties == PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) { - CUcontext Ctxt; - errcode_ret = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain( - &Ctxt, (*devices)->cuDevice_)); - piContextPtr = std::unique_ptr<_pi_context>( - new _pi_context{_pi_context::kind::primary, Ctxt, *devices}); - errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt)); - } else { - throw pi_result(CL_INVALID_VALUE); } *retcontext = piContextPtr.release(); @@ -1178,11 +1230,14 @@ pi_result cuda_piContextRelease(pi_context ctxt) { CUcontext cuCtxt = ctxt->get(); CUcontext current = nullptr; cuCtxGetCurrent(¤t); - if(cuCtxt != current) - { - PI_CHECK_ERROR(cuCtxSetCurrent(cuCtxt)); + if (cuCtxt != current) { + PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt)); } PI_CHECK_ERROR(cuCtxSynchronize()); + cuCtxGetCurrent(¤t); + if (cuCtxt == current) { + PI_CHECK_ERROR(cuCtxPopCurrent(¤t)); + } return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt)); } else { // Primary context is not destroyed, but released @@ -1253,6 +1308,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) { pi_result ret = PI_SUCCESS; try { + // Do nothing if there are other references if (memObj->decrement_reference_count() > 0) { return PI_SUCCESS; @@ -1263,7 +1319,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) { if (!memObj->is_sub_buffer()) { - ScopedContext(uniqueMemObj->get_context()); + ScopedContext active(uniqueMemObj->get_context()); switch (uniqueMemObj->allocMode_) { case _pi_mem::alloc_mode::classic: diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 2d32088886560..97f084207976a 100755 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -451,12 +451,12 @@ pi_result OCL(piextGetDeviceFunctionPointer)(pi_device device, function_pointer_ret)); } -pi_result OCL(piContextCreate)( - const cl_context_properties *properties, // TODO: untie from OpenCL - pi_uint32 num_devices, const pi_device *devices, - void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, - void *user_data1), - void *user_data, pi_context *retcontext) { +pi_result OCL(piContextCreate)(const pi_context_properties *properties, + pi_uint32 num_devices, const pi_device *devices, + void (*pfn_notify)(const char *errinfo, + const void *private_info, + size_t cb, void *user_data1), + void *user_data, pi_context *retcontext) { pi_result ret = PI_INVALID_OPERATION; *retcontext = cast( clCreateContext(properties, cast(num_devices), diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 7039f30ee9401..1eced4ec82ce0 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -44,9 +44,8 @@ context_impl::context_impl(const vector_class Devices, if (MPlatform->is_cuda()) { #if USE_PI_CUDA - const cl_context_properties props[] = { - PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, - 0}; + const pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, + UseCUDAPrimaryContext, 0}; getPlugin().call(props, DeviceIds.size(), DeviceIds.data(), nullptr, nullptr, &MContext); diff --git a/sycl/source/detail/error_handling/enqueue_kernel.cpp b/sycl/source/detail/error_handling/enqueue_kernel.cpp index 5d733ca7bbfe2..3a7ccf82dd9df 100644 --- a/sycl/source/detail/error_handling/enqueue_kernel.cpp +++ b/sycl/source/detail/error_handling/enqueue_kernel.cpp @@ -40,8 +40,8 @@ bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, size_t CompileWGSize[3] = {0}; Plugin.call( - Kernel, Device, PI_KERNEL_COMPILE_GROUP_INFO_SIZE, sizeof(size_t) * 3, - CompileWGSize, nullptr); + Kernel, Device, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(size_t) * 3, CompileWGSize, nullptr); if (CompileWGSize[0] != 0) { // OpenCL 1.x && 2.0: @@ -90,10 +90,11 @@ bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the // total number of work-items in the work-group computed as // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater - // than the value specified by PI_KERNEL_GROUP_INFO_SIZE in table 5.21. + // than the value specified by PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE in + // table 5.21. size_t KernelWGSize = 0; Plugin.call( - Kernel, Device, PI_KERNEL_GROUP_INFO_SIZE, sizeof(size_t), + Kernel, Device, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t), &KernelWGSize, nullptr); const size_t TotalNumberOfWIs = NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index a7b3b467b8988..dea1f622b1ca4 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -74,7 +74,7 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext) RT::PiContext TempContext; getPlugin().call( - MEvent, CL_EVENT_CONTEXT, sizeof(RT::PiContext), &TempContext, nullptr); + MEvent, PI_EVENT_INFO_CONTEXT, sizeof(RT::PiContext), &TempContext, nullptr); if (MContext->getHandleRef() != TempContext) { throw cl::sycl::invalid_parameter_error( "The syclContext must match the OpenCL context associated with the " @@ -95,16 +95,14 @@ event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) { void event_impl::wait( std::shared_ptr Self) const { - if (MEvent) // presence of MEvent means the command has been enqueued, so no need to // go via the slow path event waiting in the scheduler waitInternal(); else if (MCommand) - detail::Scheduler::getInstance().waitForEvent(std::move(Self)); + detail::Scheduler::getInstance().waitForEvent(Self); if (MCommand && !SYCLConfig::get()) - detail::Scheduler::getInstance().cleanupFinishedCommands( - static_cast(MCommand)); + detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); } void event_impl::wait_and_throw( diff --git a/sycl/source/detail/event_info.hpp b/sycl/source/detail/event_info.hpp index e2ba912cd6d70..e3f487f063ce7 100644 --- a/sycl/source/detail/event_info.hpp +++ b/sycl/source/detail/event_info.hpp @@ -36,7 +36,7 @@ template struct get_event_info { static RetType get(RT::PiEvent Event, const plugin &Plugin) { RetType Result = (RetType)0; // TODO catch an exception and put it to list of asynchronous exceptions - Plugin.call(Event, cl_profiling_info(Param), + Plugin.call(Event, pi_event_info(Param), sizeof(Result), &Result, nullptr); return Result; } diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 915c210393579..6339213dcb090 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -81,10 +81,10 @@ program_impl::program_impl(ContextImplPtr Context, RT::PiProgram Program) : MProgram(Program), MContext(Context), MLinkable(true) { // TODO handle the case when cl_program build is in progress - cl_uint NumDevices; + pi_uint32 NumDevices; const detail::plugin &Plugin = getPlugin(); Plugin.call( - Program, PI_PROGRAM_INFO_NUM_DEVICES, sizeof(cl_uint), &NumDevices, nullptr); + Program, PI_PROGRAM_INFO_NUM_DEVICES, sizeof(pi_uint32), &NumDevices, nullptr); vector_class PiDevices(NumDevices); Plugin.call(Program, PI_PROGRAM_INFO_DEVICES, sizeof(RT::PiDevice) * NumDevices, @@ -402,10 +402,10 @@ cl_uint program_impl::get_info() const { if (is_host()) { throw invalid_object_error("This instance of program is a host instance"); } - cl_uint Result; + pi_uint32 Result; const detail::plugin &Plugin = getPlugin(); Plugin.call(MProgram, PI_PROGRAM_INFO_REFERENCE_COUNT, - sizeof(cl_uint), &Result, nullptr); + sizeof(pi_uint32), &Result, nullptr); return Result; } diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index e2959329ce441..74d08711d8643 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -70,7 +70,7 @@ static RT::PiProgram createBinaryProgram(const ContextImplPtr Context, // FIXME: we don't yet support multiple devices with a single binary. const detail::plugin &Plugin = Context->getPlugin(); #ifndef _NDEBUG - cl_uint NumDevices = 0; + pi_uint32 NumDevices = 0; Plugin.call(Context->getHandleRef(), PI_CONTEXT_INFO_NUM_DEVICES, sizeof(NumDevices), &NumDevices, @@ -435,7 +435,7 @@ ProgramManager::getClProgramFromClKernel(RT::PiKernel Kernel, RT::PiProgram Program; const detail::plugin &Plugin = Context->getPlugin(); Plugin.call( - Kernel, PI_KERNEL_INFO_PROGRAM, sizeof(cl_program), &Program, nullptr); + Kernel, PI_KERNEL_INFO_PROGRAM, sizeof(RT::PiProgram), &Program, nullptr); return Program; } @@ -446,8 +446,8 @@ string_class ProgramManager::getProgramBuildLog(const RT::PiProgram &Program, Plugin.call(Program, PI_PROGRAM_INFO_DEVICES, 0, nullptr, &Size); vector_class PIDevices(Size / sizeof(RT::PiDevice)); - Plugin.call(Program, PI_PROGRAM_INFO_DEVICES, Size, - PIDevices.data(), nullptr); + Plugin.call(Program, PI_PROGRAM_INFO_DEVICES, + Size, PIDevices.data(), nullptr); string_class Log = "The program was built for " + std::to_string(PIDevices.size()) + " devices"; for (RT::PiDevice &Device : PIDevices) { diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 5b7690cf5fa36..b079577eb3e3d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -64,7 +64,7 @@ event queue_impl::memcpy(shared_ptr_class Impl, void *Dest, return ResEvent; } -event queue_impl::mem_advise(const void *Ptr, size_t Length, int Advice) { +event queue_impl::mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice) { context Context = get_context(); if (Context.is_host()) { return event(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 2d1fd58e8489a..c8434f0176b1a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -332,7 +332,7 @@ class queue_impl { /// @param Ptr is a USM pointer to the allocation. /// @param Length is a number of bytes in the allocation. /// @param Advice is a device-defined advice for the specified allocation. - event mem_advise(const void *Ptr, size_t Length, int Advice); + event mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice); /// Puts exception to the list of asynchronous ecxeptions. /// diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d9859929191f6..6dfdc29d2726e 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -934,11 +934,11 @@ cl_int ExecCGCommand::enqueueImp() { pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation(); Plugin.call(Kernel, Arg.MIndex, &MemArg); #else - cl_mem MemArg = (cl_mem)AllocaCmd->getMemAllocation(); + RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); Plugin.call(Kernel, Arg.MIndex, - sizeof(cl_mem), &MemArg); + sizeof(RT::PiMem), &MemArg); Plugin.call(Kernel, Arg.MIndex, - sizeof(cl_mem), &MemArg); + sizeof(RT::PiMem), &MemArg); #endif break; } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 4da29c0a23299..37c2529d44863 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -123,9 +123,13 @@ void Scheduler::waitForEvent(EventImplPtr Event) { GraphProcessor::waitForEvent(std::move(Event)); } -void Scheduler::cleanupFinishedCommands(Command *FinishedCmd) { +void Scheduler::cleanupFinishedCommands(EventImplPtr FinishedEvent) { std::lock_guard lock(MGraphLock); - MGraphBuilder.cleanupFinishedCommands(FinishedCmd); + Command *FinishedCmd = static_cast(FinishedEvent->getCommand()); + // The command might have been cleaned up (and set to nullptr) by another + // thread + if (FinishedCmd) + MGraphBuilder.cleanupFinishedCommands(FinishedCmd); } void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index e0429510eed1b..90000f6ab558c 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -79,7 +79,7 @@ class Scheduler { // Removes finished non-leaf non-alloca commands from the subgraph (assuming // that all its commands have been waited for). - void cleanupFinishedCommands(Command *FinishedCmd); + void cleanupFinishedCommands(EventImplPtr FinishedEvent); // Creates nodes in the graph, that update Req with the pointer to the host // memory which contains the latest data of the memory object. New diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index c47eba506958d..ea057815b8524 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -102,7 +102,7 @@ event queue::memcpy(void *dest, const void *src, size_t count) { return impl->memcpy(impl, dest, src, count); } -event queue::mem_advise(const void *ptr, size_t length, int advice) { +event queue::mem_advise(const void *ptr, size_t length, pi_mem_advice advice) { return impl->mem_advise(ptr, length, advice); } diff --git a/sycl/test/.clang-format b/sycl/test/.clang-format new file mode 100644 index 0000000000000..4799b66f3e9a6 --- /dev/null +++ b/sycl/test/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: LLVM +ColumnLimit: 0 diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index 95dacdcffe48e..5d891547560fa 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -1,23 +1,11 @@ -set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/") -set(LLVM_BUILD_BINARY_DIRS "${LLVM_BINARY_DIR}/bin/") set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin/") -set(CLANG_IN_BUILD "${LLVM_BINARY_DIR}/bin/clang") -set(CLANGXX_IN_BUILD "${LLVM_BINARY_DIR}/bin/clang++") -set(CLANGCL_IN_BUILD "${LLVM_BINARY_DIR}/bin/clang-cl") - -set(LLVM_DEPLOY_LIBRARY_DIRS "${CMAKE_INSTALL_PREFIX}/lib/") -set(LLVM_DEPLOY_BINARY_DIRS "${CMAKE_INSTALL_PREFIX}/bin/") -set(CLANG_IN_DEPLOY "${CMAKE_INSTALL_PREFIX}/bin/clang") -set(CLANGXX_IN_DEPLOY "${CMAKE_INSTALL_PREFIX}/bin/clang++") -set(CLANGCL_IN_DEPLOY "${CMAKE_INSTALL_PREFIX}/bin/clang-cl") get_target_property(SYCL_BINARY_DIR sycl-toolchain BINARY_DIR) set(SYCL_INCLUDE "${dst_dir}") -set(SYCL_DEPLOY_INCLUDE "${dst_deploy_dir}") set(RT_TEST_ARGS ${RT_TEST_ARGS} "-v") -set(DEPLOY_RT_TEST_ARGS ${DEPLOY_RT_TEST_ARGS} "-v --config-prefix=deploy-lit") +set(DEPLOY_RT_TEST_ARGS ${DEPLOY_RT_TEST_ARGS} "-v -D SYCL_TOOLS_DIR=${CMAKE_INSTALL_PREFIX}/bin -D SYCL_LIBS_DIR=${CMAKE_INSTALL_PREFIX}/lib -D SYCL_INCLUDE=${dst_deploy_dir}") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in @@ -33,13 +21,6 @@ configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py ) -configure_lit_site_cfg( - ${CMAKE_CURRENT_SOURCE_DIR}/deploy-lit.site.cfg.py.in - ${CMAKE_CURRENT_BINARY_DIR}/deploy-lit.site.cfg.py - MAIN_CONFIG - ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py - ) - list(APPEND SYCL_TEST_DEPS sycl-toolchain FileCheck @@ -49,11 +30,8 @@ list(APPEND SYCL_TEST_DEPS ) list(APPEND SYCL_DEPLOY_TEST_DEPS + ${SYCL_TEST_DEPS} deploy-sycl-toolchain - FileCheck - not - get_device_count_by_type - llvm-config ) add_lit_testsuite(check-sycl "Running the SYCL regression tests" @@ -62,10 +40,13 @@ add_lit_testsuite(check-sycl "Running the SYCL regression tests" PARAMS "SYCL_BE=PI_OPENCL" DEPENDS ${SYCL_TEST_DEPS} ) + add_lit_testsuite(check-sycl-deploy "Running the SYCL regression tests" ${CMAKE_CURRENT_BINARY_DIR} ARGS ${DEPLOY_RT_TEST_ARGS} + PARAMS "SYCL_BE=PI_OPENCL" DEPENDS ${SYCL_DEPLOY_TEST_DEPS} + EXCLUDE_FROM_CHECK_ALL ) set_target_properties(check-sycl PROPERTIES FOLDER "SYCL tests") @@ -76,7 +57,7 @@ add_lit_testsuites(SYCL ${CMAKE_CURRENT_SOURCE_DIR} if(SYCL_BUILD_PI_CUDA) add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA" ${CMAKE_CURRENT_BINARY_DIR} - ARGS ${RT_TEST_ARGS} + ARGS ${RT_TEST_ARGS} PARAMS "SYCL_BE=PI_CUDA" DEPENDS ${SYCL_TEST_DEPS} ) diff --git a/sycl/test/basic_tests/handler/interop_task.cpp b/sycl/test/basic_tests/handler/interop_task.cpp index 1857a0e359db5..dba8cf0d8be2d 100644 --- a/sycl/test/basic_tests/handler/interop_task.cpp +++ b/sycl/test/basic_tests/handler/interop_task.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl %s -o %t.out -lOpenCL +// RUN: %clangxx -fsycl %s -o %t.out -L %opencl_libs_dir -lOpenCL // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // REQUIRES: opencl diff --git a/sycl/test/deploy-lit.site.cfg.py.in b/sycl/test/deploy-lit.site.cfg.py.in deleted file mode 100644 index e99cee34e1098..0000000000000 --- a/sycl/test/deploy-lit.site.cfg.py.in +++ /dev/null @@ -1,25 +0,0 @@ -@LIT_SITE_CFG_IN_HEADER@ - -import sys - -config.clang = "@CLANG_IN_DEPLOY@" -config.clangxx = "@CLANGXX_IN_DEPLOY@" -config.clang_cl = "@CLANGCL_IN_DEPLOY@" -config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" -config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" -config.llvm_build_libs_dir = "@LLVM_DEPLOY_LIBRARY_DIRS@" -config.llvm_build_bins_dir = "@LLVM_DEPLOY_BINARY_DIRS@" -config.llvm_binary_dir = "@LLVM_BINARY_DIR@" -config.sycl_include = "@SYCL_DEPLOY_INCLUDE@" -config.sycl_obj_root = "@SYCL_BINARY_DIR@" -config.opencl_lib = "@OpenCL_LIBRARIES@" -config.opencl_libs_dir = os.path.dirname("@OpenCL_LIBRARIES@") - -config.llvm_enable_projects = "@LLVM_ENABLE_PROJECTS@" - - -import lit.llvm -lit.llvm.initialize(lit_config, config) - -# Let the main config do the real work. -lit_config.load_config(config, "@SYCL_SOURCE_DIR@/test/lit.cfg.py") diff --git a/sycl/test/devicelib/assert-windows.cpp b/sycl/test/devicelib/assert-windows.cpp index 1451431b46946..266ec3845d766 100644 --- a/sycl/test/devicelib/assert-windows.cpp +++ b/sycl/test/devicelib/assert-windows.cpp @@ -5,7 +5,7 @@ // XFAIL: * // // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/../bin/libsycl-msvc.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/../bin/libsycl-msvc.o -o %t.out // // MSVC implementation of assert does not call an unreachable built-in, so the // program doesn't terminate when fallback is used. diff --git a/sycl/test/devicelib/assert.cpp b/sycl/test/devicelib/assert.cpp index d0f18fe8cb544..757647322cb64 100644 --- a/sycl/test/devicelib/assert.cpp +++ b/sycl/test/devicelib/assert.cpp @@ -1,6 +1,6 @@ // REQUIRES: cpu,linux // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-glibc.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-glibc.o -o %t.out // (see the other RUN lines below; it is a bit complicated) // // assert() call in device code guarantees nothing: on some devices it behaves diff --git a/sycl/test/devicelib/c99_complex_math_fp64_test.cpp b/sycl/test/devicelib/c99_complex_math_fp64_test.cpp index 7fdd07d4c3cd7..b7da3b6a9e37a 100644 --- a/sycl/test/devicelib/c99_complex_math_fp64_test.cpp +++ b/sycl/test/devicelib/c99_complex_math_fp64_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-complex-fp64.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex-fp64.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/c99_complex_math_test.cpp b/sycl/test/devicelib/c99_complex_math_test.cpp index 8b28e943d6547..9637ccd4a0568 100644 --- a/sycl/test/devicelib/c99_complex_math_test.cpp +++ b/sycl/test/devicelib/c99_complex_math_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-complex.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/cmath_test.cpp b/sycl/test/devicelib/cmath_test.cpp index ded8d047a54d8..217ad4121f6c8 100644 --- a/sycl/test/devicelib/cmath_test.cpp +++ b/sycl/test/devicelib/cmath_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-cmath.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/cmath_test_fp64.cpp b/sycl/test/devicelib/cmath_test_fp64.cpp index 5ec7f1b34c24e..1c8b7afa5d4a5 100644 --- a/sycl/test/devicelib/cmath_test_fp64.cpp +++ b/sycl/test/devicelib/cmath_test_fp64.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-cmath-fp64.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/math_fp64_test.cpp b/sycl/test/devicelib/math_fp64_test.cpp index b0eedae1e8f7e..30c21dbc3b77e 100644 --- a/sycl/test/devicelib/math_fp64_test.cpp +++ b/sycl/test/devicelib/math_fp64_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-cmath-fp64.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/math_override_test.cpp b/sycl/test/devicelib/math_override_test.cpp index 92b419bfdca8b..62166b95ab1ab 100644 --- a/sycl/test/devicelib/math_override_test.cpp +++ b/sycl/test/devicelib/math_override_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-cmath.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/math_test.cpp b/sycl/test/devicelib/math_test.cpp index 4afba887681a2..1b5aa4332fe8e 100644 --- a/sycl/test/devicelib/math_test.cpp +++ b/sycl/test/devicelib/math_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-cmath.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out #include #include #include diff --git a/sycl/test/devicelib/std_complex_math_fp64_test.cpp b/sycl/test/devicelib/std_complex_math_fp64_test.cpp index b59591b578981..e1213a713ad5b 100644 --- a/sycl/test/devicelib/std_complex_math_fp64_test.cpp +++ b/sycl/test/devicelib/std_complex_math_fp64_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-complex-fp64.o %llvm_build_libs_dir/libsycl-cmath-fp64.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex-fp64.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out #include #include #include "math_utils.hpp" diff --git a/sycl/test/devicelib/std_complex_math_test.cpp b/sycl/test/devicelib/std_complex_math_test.cpp index 9c817e714a88f..f3fe5ae9ae510 100644 --- a/sycl/test/devicelib/std_complex_math_test.cpp +++ b/sycl/test/devicelib/std_complex_math_test.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows // RUN: %clangxx -fsycl -c %s -o %t.o -// RUN: %clangxx -fsycl %t.o %llvm_build_libs_dir/libsycl-complex.o %llvm_build_libs_dir/libsycl-cmath.o -o %t.out +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex.o %sycl_libs_dir/libsycl-cmath.o -o %t.out #include #include #include "math_utils.hpp" diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index f4839e086efdb..999b6d1cd5584 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -38,21 +38,21 @@ config.available_features.add('linux') # Propagate 'LD_LIBRARY_PATH' through the environment. if 'LD_LIBRARY_PATH' in os.environ: - config.environment['LD_LIBRARY_PATH'] = os.path.pathsep.join((config.environment['LD_LIBRARY_PATH'], config.llvm_build_libs_dir)) + config.environment['LD_LIBRARY_PATH'] = os.path.pathsep.join((config.environment['LD_LIBRARY_PATH'], config.sycl_libs_dir)) else: - config.environment['LD_LIBRARY_PATH'] = config.llvm_build_libs_dir + config.environment['LD_LIBRARY_PATH'] = config.sycl_libs_dir elif platform.system() == "Windows": config.available_features.add('windows') if 'LIB' in os.environ: - config.environment['LIB'] = os.path.pathsep.join((config.environment['LIB'], config.llvm_build_libs_dir)) + config.environment['LIB'] = os.path.pathsep.join((config.environment['LIB'], config.sycl_libs_dir)) else: - config.environment['LIB'] = config.llvm_build_libs_dir + config.environment['LIB'] = config.sycl_libs_dir if 'PATH' in os.environ: - config.environment['PATH'] = os.path.pathsep.join((config.environment['PATH'], config.llvm_build_bins_dir)) + config.environment['PATH'] = os.path.pathsep.join((config.environment['PATH'], config.sycl_tools_dir)) else: - config.environment['PATH'] = config.llvm_build_bins_dir + config.environment['PATH'] = config.sycl_tools_dir elif platform.system() == "Darwin": # FIXME: surely there is a more elegant way to instantiate the Xcode directories. @@ -61,34 +61,31 @@ else: config.environment['CPATH'] = "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1" config.environment['CPATH'] = os.path.pathsep.join((config.environment['CPATH'], "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/")) - config.environment['DYLD_LIBRARY_PATH'] = config.llvm_build_libs_dir + config.environment['DYLD_LIBRARY_PATH'] = config.sycl_libs_dir # propagate the environment variable OCL_ICD_FILANEMES to use proper runtime. if 'OCL_ICD_FILENAMES' in os.environ: config.environment['OCL_ICD_FILENAMES'] = os.environ['OCL_ICD_FILENAMES'] -config.substitutions.append( ('%clang_cc1', ' ' + config.clang + ' -cc1 ') ) -config.substitutions.append( ('%clangxx', ' ' + config.clangxx ) ) -config.substitutions.append( ('%clang_cl', ' ' + config.clang_cl ) ) -config.substitutions.append( ('%clang', ' ' + config.clang ) ) -config.substitutions.append( ('%llvm_build_libs_dir', config.llvm_build_libs_dir ) ) +config.substitutions.append( ('%sycl_libs_dir', config.sycl_libs_dir ) ) config.substitutions.append( ('%sycl_include', config.sycl_include ) ) config.substitutions.append( ('%opencl_libs_dir', config.opencl_libs_dir) ) config.substitutions.append( ('%sycl_source_dir', config.sycl_source_dir) ) +llvm_config.use_clang() + tools = ['llvm-spirv'] -tool_dirs = [config.llvm_tools_dir] +tool_dirs = [config.sycl_tools_dir] llvm_config.add_tool_substitutions(tools, tool_dirs) if "opencl-aot" in config.llvm_enable_projects: if 'PATH' in os.environ: print("Adding path to opencl-aot tool to PATH") - os.environ['PATH'] = os.path.pathsep.join((os.getenv('PATH'), config.llvm_build_bins_dir)) + os.environ['PATH'] = os.path.pathsep.join((os.getenv('PATH'), config.sycl_tools_dir)) backend=lit_config.params.get('SYCL_BE', "PI_OPENCL") -get_device_count_by_type_path = os.path.join(config.llvm_binary_dir, - "bin", "get_device_count_by_type") +get_device_count_by_type_path = os.path.join(config.llvm_tools_dir, "get_device_count_by_type") def getDeviceCount(device_type): is_cuda = False; @@ -186,7 +183,7 @@ def getDeviceCount(device_type): path = config.environment['PATH'] -path = os.path.pathsep.join((config.llvm_tools_dir, path)) +path = os.path.pathsep.join((config.sycl_tools_dir, path)) config.environment['PATH'] = path # Device AOT compilation tools aren't part of the SYCL project, diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in index f1a66894b1f0f..18ef72f3a94e7 100644 --- a/sycl/test/lit.site.cfg.py.in +++ b/sycl/test/lit.site.cfg.py.in @@ -2,18 +2,16 @@ import sys -config.clang = "@CLANG_IN_BUILD@" -config.clangxx = "@CLANGXX_IN_BUILD@" -config.clang_cl = "@CLANGCL_IN_BUILD@" config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.sycl_tools_dir = lit_config.params.get('SYCL_TOOLS_DIR', "@LLVM_TOOLS_DIR@") config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" -config.llvm_build_libs_dir = "@LLVM_BUILD_LIBRARY_DIRS@" -config.llvm_build_bins_dir = "@LLVM_BUILD_BINARY_DIRS@" -config.llvm_binary_dir = "@LLVM_BINARY_DIR@" -config.sycl_include = "@SYCL_INCLUDE@" +config.sycl_include = lit_config.params.get('SYCL_INCLUDE', "@SYCL_INCLUDE@") config.sycl_obj_root = "@SYCL_BINARY_DIR@" config.sycl_source_dir = "@SYCL_SOURCE_DIR@/source" config.opencl_libs_dir = os.path.dirname("@OpenCL_LIBRARIES@") +config.sycl_libs_dir = lit_config.params.get('SYCL_LIBS_DIR', "@LLVM_LIBS_DIR@") +config.target_triple = "@TARGET_TRIPLE@" +config.host_triple = "@LLVM_HOST_TRIPLE@" config.llvm_enable_projects = "@LLVM_ENABLE_PROJECTS@" diff --git a/sycl/test/regression/sub-group-store-const-ref.cpp b/sycl/test/regression/sub-group-store-const-ref.cpp new file mode 100644 index 0000000000000..991f56f30a130 --- /dev/null +++ b/sycl/test/regression/sub-group-store-const-ref.cpp @@ -0,0 +1,16 @@ +// RUN: %clangxx -I %sycl_include -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics +// +//==-- sub-group-store-const-ref.cpp ---------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This test checks that sub_group::store supports const reference. +//===----------------------------------------------------------------------===// +#include +using namespace sycl; + +void test(intel::sub_group sg, global_ptr ptr) { sg.store(ptr, 1); } diff --git a/sycl/test/scheduler/CommandCleanupThreadSafety.cpp b/sycl/test/scheduler/CommandCleanupThreadSafety.cpp new file mode 100644 index 0000000000000..f1d39db4e8995 --- /dev/null +++ b/sycl/test/scheduler/CommandCleanupThreadSafety.cpp @@ -0,0 +1,40 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl %s -o %t.out -lpthread +// RUN: %CPU_RUN_PLACEHOLDER %t.out +#include + +#include +#include +#include +#include + +// This test checks that the command graph cleanup works properly when +// invoked from multiple threads. +using namespace cl::sycl; + +class Foo; + +event submitTask(queue &Q, buffer &Buf) { + return Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task([=]() { Acc[0] = 42; }); + }); +} + +int main() { + queue Q; + buffer Buf(range<1>(1)); + + // Create multiple commands, each one dependent on the previous + std::vector Events; + const std::size_t NTasks = 16; + for (std::size_t I = 0; I < NTasks; ++I) + Events.push_back(submitTask(Q, Buf)); + + // Initiate cleanup from multiple threads + std::vector Threads; + for (event &E : Events) + Threads.emplace_back([&]() { E.wait(); }); + for (std::thread &T : Threads) + T.join(); +} diff --git a/sycl/test/scheduler/FinishedCmdCleanup.cpp b/sycl/test/scheduler/FinishedCmdCleanup.cpp index e0f736886b040..cf036d81d65fa 100644 --- a/sycl/test/scheduler/FinishedCmdCleanup.cpp +++ b/sycl/test/scheduler/FinishedCmdCleanup.cpp @@ -1,6 +1,7 @@ // RUN: %clangxx -fsycl -I %sycl_source_dir %s -o %t.out // RUN: %t.out #include +#include #include #include @@ -76,7 +77,9 @@ int main() { addEdge(InnerA, &LeafA, &AllocaA); addEdge(InnerA, InnerB, &AllocaB); - TS.cleanupFinishedCommands(InnerA); + std::shared_ptr Event{new detail::event_impl{}}; + Event->setCommand(InnerA); + TS.cleanupFinishedCommands(Event); TS.removeRecordForMemObj(detail::getSyclObjImpl(BufC).get()); assert(NInnerCommandsAlive == 0); diff --git a/sycl/test/usm/memadvise.cpp b/sycl/test/usm/memadvise.cpp index a7e152b02d946..111519169c7ac 100644 --- a/sycl/test/usm/memadvise.cpp +++ b/sycl/test/usm/memadvise.cpp @@ -37,7 +37,7 @@ int main() { if (s_head == nullptr) { return -1; } - q.mem_advise(s_head, sizeof(Node), 42); + q.mem_advise(s_head, sizeof(Node), PI_MEM_ADVICE_SET_READ_MOSTLY); Node *s_cur = s_head; for (int i = 0; i < numNodes; i++) { @@ -48,7 +48,7 @@ int main() { if (s_cur->pNext == nullptr) { return -1; } - q.mem_advise(s_cur->pNext, sizeof(Node), 42); + q.mem_advise(s_cur->pNext, sizeof(Node), PI_MEM_ADVICE_SET_READ_MOSTLY); } else { s_cur->pNext = nullptr; } diff --git a/sycl/unittests/pi/cuda/test_base_objects.cpp b/sycl/unittests/pi/cuda/test_base_objects.cpp index d854441088db3..3c4f8888a4bc2 100644 --- a/sycl/unittests/pi/cuda/test_base_objects.cpp +++ b/sycl/unittests/pi/cuda/test_base_objects.cpp @@ -73,7 +73,7 @@ TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreate) { ASSERT_EQ(cuErr, CUDA_SUCCESS); } -TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreatePrimary) { +TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreatePrimaryTrue) { pi_uint32 numPlatforms = 0; pi_platform platform; pi_device device; @@ -91,11 +91,12 @@ TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreatePrimary) { ASSERT_EQ((Plugins[0].call_nocheck( platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)), PI_SUCCESS); - cl_context_properties properties = PI_CONTEXT_PROPERTIES_CUDA_PRIMARY; + pi_context_properties properties[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, + PI_TRUE, 0}; pi_context ctxt; ASSERT_EQ((Plugins[0].call_nocheck( - &properties, 1, &device, nullptr, nullptr, &ctxt)), + properties, 1, &device, nullptr, nullptr, &ctxt)), PI_SUCCESS); EXPECT_NE(ctxt, nullptr); EXPECT_EQ(ctxt->get_device(), device); @@ -118,6 +119,52 @@ TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreatePrimary) { PI_SUCCESS); } +TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreatePrimaryFalse) { + pi_uint32 numPlatforms = 0; + pi_platform platform; + pi_device device; + + ASSERT_EQ((Plugins[0].call_nocheck( + 0, nullptr, &numPlatforms)), + PI_SUCCESS) + << "piPlatformsGet failed.\n"; + + ASSERT_EQ((Plugins[0].call_nocheck( + numPlatforms, &platform, nullptr)), + PI_SUCCESS) + << "piPlatformsGet failed.\n"; + + ASSERT_EQ((Plugins[0].call_nocheck( + platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)), + PI_SUCCESS); + pi_context_properties properties[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, + PI_FALSE, 0}; + + pi_context ctxt; + ASSERT_EQ((Plugins[0].call_nocheck( + properties, 1, &device, nullptr, nullptr, &ctxt)), + PI_SUCCESS); + EXPECT_NE(ctxt, nullptr); + EXPECT_EQ(ctxt->get_device(), device); + EXPECT_FALSE(ctxt->is_primary()); + + // Retrieve the cuCtxt to check information is correct + CUcontext cudaContext = ctxt->get(); + unsigned int version = 0; + CUresult cuErr = cuCtxGetApiVersion(cudaContext, &version); + ASSERT_EQ(cuErr, CUDA_SUCCESS); + EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION); + + // Current context in the stack? + CUcontext current; + cuErr = cuCtxGetCurrent(¤t); + ASSERT_EQ(cuErr, CUDA_SUCCESS); + ASSERT_EQ(current, cudaContext); + ASSERT_EQ( + (Plugins[0].call_nocheck(ctxt)), + PI_SUCCESS); +} + TEST_F(DISABLED_CudaBaseObjectsTest, piContextCreateChildThread) { pi_uint32 numPlatforms = 0; pi_platform platform;