From 2c31fa487e2b291e47ee2d588f2598d474dc5113 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Mon, 9 Jan 2023 08:50:26 -0800 Subject: [PATCH 01/23] [OpenCL] Add support for cslice partitioning Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/CMakeLists.txt | 3 + sycl/plugins/opencl/pi_opencl.cpp | 230 ++++++++++++++++++++++++++++- sycl/plugins/opencl/pi_opencl.hpp | 130 ++++++++++++++++ sycl/plugins/opencl/pi_utils.hpp | 79 ++++++++++ 4 files changed, 435 insertions(+), 7 deletions(-) create mode 100644 sycl/plugins/opencl/pi_utils.hpp diff --git a/sycl/plugins/opencl/CMakeLists.txt b/sycl/plugins/opencl/CMakeLists.txt index 9943827e48788..a6638192f487c 100644 --- a/sycl/plugins/opencl/CMakeLists.txt +++ b/sycl/plugins/opencl/CMakeLists.txt @@ -10,12 +10,15 @@ #TODO: Currently, the pi.h header is common between sycl and plugin library sources. #This can be changed by copying the pi.h file in the plugins project. +find_package(Threads REQUIRED) + add_sycl_plugin(opencl SOURCES "${sycl_inc_dir}/sycl/detail/pi.h" "pi_opencl.cpp" LIBRARIES OpenCL-ICD + Threads::Threads ) set_target_properties(pi_opencl PROPERTIES LINKER_LANGUAGE CXX) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index e71f23e0d2e4d..9ed1c18447aef 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -29,8 +30,21 @@ #include #include #include +#include #include +#include "pi_utils.hpp" + +static const bool ExposeCSliceInAffinityPartitioning = [] { + const char *Flag = + std::getenv("SYCL_PI_OPENCL_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); + return Flag ? std::atoi(Flag) != 0 : false; +}(); + +#define PI_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + #define CHECK_ERR_SET_NULL_RET(err, ptr, reterr) \ if (err != CL_SUCCESS) { \ if (ptr != nullptr) \ @@ -258,9 +272,37 @@ static pi_result USMSetIndirectAccess(pi_kernel kernel) { extern "C" { +// Return sub-device level +// 0 -> root device +// 1 -> sub-device +// 2 -> sub-sub-device (CCS) +// -1 -> invalid device +static int getSubLevel(pi_device device) { + if (!device) + return -1; + cl_device_id parentId = nullptr; + clGetDeviceInfo(cast(device), CL_DEVICE_PARENT_DEVICE, + sizeof(cl_device_id), &parentId, NULL); + if (parentId == nullptr) + return 0; + cl_device_id parentParentId = nullptr; + clGetDeviceInfo(parentId, CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), + &parentParentId, NULL); + if (parentParentId == nullptr) + return 1; + cl_device_id parentParentParentId = nullptr; + clGetDeviceInfo(parentParentId, CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), + &parentParentParentId, NULL); + if (parentParentParentId == nullptr) + return 2; + return -1; +} + pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) { + PI_ASSERT(device, PI_ERROR_INVALID_DEVICE); + ReturnHelper ReturnValue(paramValueSize, paramValue, paramValueSizeRet); switch (paramName) { // TODO: Check regularly to see if support in enabled in OpenCL. // Intel GPU EU device-specific information extensions. @@ -342,6 +384,103 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return PI_SUCCESS; } + case PI_DEVICE_INFO_PARTITION_PROPERTIES: { + // SYCL spec says: if this SYCL device cannot be partitioned into at least + // two sub devices then the returned vector must be empty. + pi_uint32 partitionMaxSubDevices = 0; + if (device->subLevel == -1) + device->subLevel = getSubLevel(device); + if (device->isRootDevice()) { + clGetDeviceInfo( + cast(device), CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + sizeof(partitionMaxSubDevices), &partitionMaxSubDevices, nullptr); + } else if (device->isSubDevice()) { + // find out number of CCSes + bool supported = false; + cl_int ret_err = CL_SUCCESS; + ret_err = + checkDeviceExtensions(cast(device), + {"cl_intel_command_queue_families"}, supported); + if (ret_err != CL_SUCCESS) + return static_cast(ret_err); + if (!supported) { + std::cout + << "This device does not support cl_intel_command_queue_families" + << std::endl; + return ReturnValue(pi_device_partition_property{0}); + } + cl_queue_family_properties_intel qfprops[3]; + size_t qsize = 0; + clGetDeviceInfo( + cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, + 3*sizeof(cl_queue_family_properties_intel), qfprops, &qsize); + qsize = qsize/sizeof(cl_queue_family_properties_intel); + for ( size_t q = 0; q < qsize; q++ ) { + if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && + qfprops[q].count > partitionMaxSubDevices) { + partitionMaxSubDevices = qfprops[q].count; + } + } + } else { + return ReturnValue(pi_device_partition_property{0}); + } + + auto ReturnHelper = [&](auto... Partitions) { + struct { + pi_device_partition_property Arr[sizeof...(Partitions) + 1]; + } PartitionProperties = {{Partitions..., 0}}; + return ReturnValue(PartitionProperties); + }; + + // Root device + if (device->subLevel == -1) + device->subLevel = getSubLevel(device); + if (device->isRootDevice()) { + if (partitionMaxSubDevices < 2) { + return ReturnValue(pi_device_partition_property{0}); + } + return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } else if (device->isSubDevice()) { + if (partitionMaxSubDevices < 2) { + return ReturnValue(pi_device_partition_property{0}); + } + if (ExposeCSliceInAffinityPartitioning) { + return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, + PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } else { + return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); + } + } else { + return ReturnValue(pi_device_partition_property{0}); + } + } + case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: + return ReturnValue(pi_device_affinity_domain{ + PI_DEVICE_AFFINITY_DOMAIN_NUMA | + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); + case PI_DEVICE_INFO_PARTITION_TYPE: { + if (device->subLevel == -1) + device->subLevel = getSubLevel(device); + // For root-device there is no partitioning to report. + if (device->isRootDevice()) + return ReturnValue(pi_device_partition_property{0}); + if (device->isSubDevice()) { + struct { + pi_device_partition_property Arr[3]; + } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, + 0}}; + return ReturnValue(PartitionProperties); + } + if (device->isSubSubDevice()) { + struct { + pi_device_partition_property Arr[2]; + } PartitionProperties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; + return ReturnValue(PartitionProperties); + } + return ReturnValue(pi_device_partition_property{0}); + } + default: cl_int result = clGetDeviceInfo( cast(device), cast(paramName), @@ -350,6 +489,65 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, } } +pi_result piDevicePartition(pi_device device, + const pi_device_partition_property *properties, + pi_uint32 num_devices, pi_device *out_devices, + pi_uint32 *out_num_devices) { + cl_int result = CL_DEVICE_NOT_FOUND; + if (device->subLevel == -1) + device->subLevel = getSubLevel(device); + // For root-device there is no partitioning to report. + if (device->isRootDevice()) { + result = clCreateSubDevices( + cast(device), + cast(properties), + cast(num_devices), cast(out_devices), + cast(out_num_devices)); + if (out_devices) { + for (uint32_t i = 0; i < *out_num_devices; ++i) { + out_devices[i]->subLevel = device->subLevel + 1; + } + } + } else if (device->isSubDevice()) { + cl_queue_family_properties_intel qfprops[3]; + size_t qsize = 0; + cl_int family = -1; + cl_uint partitionMaxSubDevices = 0; + clGetDeviceInfo( + cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, + 3*sizeof(cl_queue_family_properties_intel), qfprops, &qsize); + qsize = qsize/sizeof(cl_queue_family_properties_intel); + for ( size_t q = 0; q < qsize; q++ ) { + if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && + qfprops[q].count > partitionMaxSubDevices) { + family = q; + partitionMaxSubDevices = qfprops[q].count; + } + } + *out_num_devices = partitionMaxSubDevices; + if (out_devices) { + for (uint32_t i = 0; i < *out_num_devices; ++i) { + pi_device cloneDevice(device); + out_devices[i] = cloneDevice; + } + for (uint32_t i = 0; i < *out_num_devices; ++i) { + out_devices[i]->subLevel = device->subLevel + 1; + out_devices[i]->family = family; + out_devices[i]->index = i % (*out_num_devices); + } + } + return PI_SUCCESS; + } + // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in out_num_devices + if (result == CL_DEVICE_NOT_FOUND) { + std::cout << "Device not found\n"; + assert(out_num_devices != 0); + *out_num_devices = 0; + return PI_SUCCESS; + } + return cast(result); +} + pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms) { cl_int result = clGetPlatformIDs(cast(num_entries), @@ -381,6 +579,9 @@ pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, cast(num_entries), cast(devices), cast(num_devices)); + for (pi_uint32 i = 0; i < num_entries; ++i) { + devices[i]->subLevel = 0; + } // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in num_devices if (result == CL_DEVICE_NOT_FOUND) { assert(num_devices != 0); @@ -482,6 +683,7 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, return PI_ERROR_INVALID_VALUE; return piQueueCreate(Context, Device, Flags, Queue); } + pi_result piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue) { assert(queue && "piQueueCreate failed, queue argument is null"); @@ -518,12 +720,26 @@ pi_result piQueueCreate(pi_context context, pi_device device, return cast(ret_err); } - cl_queue_properties CreationFlagProperties[] = { - CL_QUEUE_PROPERTIES, - cast(properties) & SupportByOpenCL, 0}; - *queue = cast(clCreateCommandQueueWithProperties( - cast(context), cast(device), - CreationFlagProperties, &ret_err)); + if (device->subLevel == 2) { + cl_queue_properties CreationFlagProperties[] = { + CL_QUEUE_PROPERTIES, + cast(properties) & SupportByOpenCL, + CL_QUEUE_FAMILY_INTEL, + device->family, + CL_QUEUE_INDEX_INTEL, + device->index, + 0}; + *queue = cast(clCreateCommandQueueWithProperties( + cast(context), cast(device), + CreationFlagProperties, &ret_err)); + } else { + cl_queue_properties CreationFlagProperties[] = { + CL_QUEUE_PROPERTIES, + cast(properties) & SupportByOpenCL, 0}; + *queue = cast(clCreateCommandQueueWithProperties( + cast(context), cast(device), + CreationFlagProperties, &ret_err)); + } return cast(ret_err); } @@ -1689,7 +1905,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { // Device _PI_CL(piDevicesGet, piDevicesGet) _PI_CL(piDeviceGetInfo, piDeviceGetInfo) - _PI_CL(piDevicePartition, clCreateSubDevices) + _PI_CL(piDevicePartition, piDevicePartition) _PI_CL(piDeviceRetain, clRetainDevice) _PI_CL(piDeviceRelease, clReleaseDevice) _PI_CL(piextDeviceSelectBinary, piextDeviceSelectBinary) diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index 7835df8c4cb6e..da931687f7af4 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -17,9 +17,13 @@ #ifndef PI_OPENCL_HPP #define PI_OPENCL_HPP +#include #include +#include #include +#include #include +#include // This version should be incremented for any change made to this file or its // corresponding .cpp file. @@ -28,6 +32,111 @@ #define _PI_OPENCL_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_OPENCL_PLUGIN_VERSION) +// A single-threaded app has an opportunity to enable this mode to avoid +// overhead from mutex locking. Default value is 0 which means that single +// thread mode is disabled. +static const bool SingleThreadMode = [] { + const char *Ret = std::getenv("SYCL_PI_OPENCL_SINGLE_THREAD_MODE"); + const bool RetVal = Ret ? std::stoi(Ret) : 0; + return RetVal; +}(); + +// Class which acts like shared_mutex if SingleThreadMode variable is not set. +// If SingleThreadMode variable is set then mutex operations are turned into +// nop. +class pi_shared_mutex_ocl { + std::shared_mutex Mutex; + +public: + void lock() { + if (!SingleThreadMode) + Mutex.lock(); + } + bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } + void unlock() { + if (!SingleThreadMode) + Mutex.unlock(); + } + + void lock_shared() { + if (!SingleThreadMode) + Mutex.lock_shared(); + } + bool try_lock_shared() { + return SingleThreadMode ? true : Mutex.try_lock_shared(); + } + void unlock_shared() { + if (!SingleThreadMode) + Mutex.unlock_shared(); + } +}; + +// This wrapper around std::atomic is created to limit operations with reference +// counter and to make allowed operations more transparent in terms of +// thread-safety in the plugin. increment() and load() operations do not need a +// mutex guard around them since the underlying data is already atomic. +// decrementAndTest() method is used to guard a code which needs to be +// executed when object's ref count becomes zero after release. This method also +// doesn't need a mutex guard because decrement operation is atomic and only one +// thread can reach ref count equal to zero, i.e. only a single thread can pass +// through this check. +struct ReferenceCounter { + ReferenceCounter() : RefCount{1} {} + + // Reset the counter to the initial value. + void reset() { RefCount = 1; } + + // Used when retaining an object. + void increment() { RefCount++; } + + // Supposed to be used in pi*GetInfo* methods where ref count value is + // requested. + uint32_t load() { return RefCount.load(); } + + // This method allows to guard a code which needs to be executed when object's + // ref count becomes zero after release. It is important to notice that only a + // single thread can pass through this check. This is true because of several + // reasons: + // 1. Decrement operation is executed atomically. + // 2. It is not allowed to retain an object after its refcount reaches zero. + // 3. It is not allowed to release an object more times than the value of + // the ref count. + // 2. and 3. basically means that we can't use an object at all as soon as its + // refcount reaches zero. Using this check guarantees that code for deleting + // an object and releasing its resources is executed once by a single thread + // and we don't need to use any mutexes to guard access to this object in the + // scope after this check. Of course if we access another objects in this code + // (not the one which is being deleted) then access to these objects must be + // guarded, for example with a mutex. + bool decrementAndTest() { return --RefCount == 0; } + +private: + std::atomic RefCount; +}; + +// Base class to store common data +struct _pi_object { + _pi_object() : RefCount{} {} + + // Must be atomic to prevent data race when incrementing/decrementing. + ReferenceCounter RefCount; + + // This mutex protects accesses to all the non-const member variables. + // Exclusive access is required to modify any of these members. + // + // To get shared access to the object in a scope use std::shared_lock: + // std::shared_lock Lock(Obj->Mutex); + // To get exclusive access to the object in a scope use std::scoped_lock: + // std::scoped_lock Lock(Obj->Mutex); + // + // If several pi objects are accessed in a scope then each object's mutex must + // be locked. For example, to get write access to Obj1 and Obj2 and read + // access to Obj3 in a scope use the following approach: + // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); + // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); + pi_shared_mutex_ocl Mutex; +}; + namespace OCLV { class OpenCLVersion { protected: @@ -115,4 +224,25 @@ inline const OpenCLVersion V3_0(3, 0); } // namespace OCLV +// Define the types that are opaque in pi.h in a manner suitable for OpenCL +// plugin + +struct _pi_device : _pi_object { + _pi_device(pi_platform Plt) : Platform{Plt} { + subLevel = family = index = -1; + // NOTE: one must additionally call initialize() to complete + // PI device creation. + } + // PI platform to which this device belongs. + pi_platform Platform; + + // Info stored for sub-sub device queue creation + int subLevel; // 0 - root device; 1 - sub-device; 2 - sub-sub-device + pi_int32 family; // SYCL queue family + pi_int32 index; // SYCL queue index inside a given family of queues + bool isRootDevice(void) { return subLevel == 0; } + bool isSubDevice(void) { return subLevel == 1; } + bool isSubSubDevice(void) { return subLevel == 2; } +}; + #endif // PI_OPENCL_HPP diff --git a/sycl/plugins/opencl/pi_utils.hpp b/sycl/plugins/opencl/pi_utils.hpp new file mode 100644 index 0000000000000..e6330144c7b92 --- /dev/null +++ b/sycl/plugins/opencl/pi_utils.hpp @@ -0,0 +1,79 @@ +//===--------- pi_utils.hpp - Plugin Utility Functions -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#ifndef PI_UTILS_HPP +#define PI_UTILS_HPP + +// This version should be incremented for any change made to this file or its +// corresponding .cpp file. +#define _PI_LEVEL_ZERO_PLUGIN_VERSION 1 + +#define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \ + _PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION) + +#define ARG_UNUSED(x) (void)x + +namespace { + +// Helper functions for unified 'Return' type declaration - imported +// from pi_level_zero.cpp + +template +pi_result getInfoImpl(size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet, T Value, size_t ValueSize, + Assign &&AssignFunc) { + if (ParamValue != nullptr) { + if (ParamValueSize < ValueSize) { + return PI_ERROR_INVALID_VALUE; + } + AssignFunc(ParamValue, Value, ValueSize); + } + if (ParamValueSizeRet != nullptr) { + *ParamValueSizeRet = ValueSize; + } + return PI_SUCCESS; +} + +template +pi_result getInfo(size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet, T Value) { + auto assignment = [](void *ParamValue, T Value, size_t ValueSize) { + ARG_UNUSED(ValueSize); + *static_cast(ParamValue) = Value; + }; + return getInfoImpl(ParamValueSize, ParamValue, ParamValueSizeRet, Value, + sizeof(T), assignment); +} + +template +pi_result getInfoArray(size_t ArrayLength, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet, T *Value) { + return getInfoImpl(ParamValueSize, ParamValue, ParamValueSizeRet, Value, + ArrayLength * sizeof(T), memcpy); +} + +class ReturnHelper { +public: + ReturnHelper(size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + param_value_size_ret(param_value_size_ret) {} + + template pi_result operator()(const T &t) { + return getInfo(param_value_size, param_value, param_value_size_ret, t); + } + +private: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +} // anonymous namespace + +#endif // PI_UTILS_HPP From 3ba4ade7a6a6b013280df8deaf198c68a1c90535 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Mon, 9 Jan 2023 09:15:11 -0800 Subject: [PATCH 02/23] Fix type issue Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 2 +- sycl/plugins/opencl/pi_opencl.hpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 9ed1c18447aef..a330891ca67a2 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -511,7 +511,7 @@ pi_result piDevicePartition(pi_device device, } else if (device->isSubDevice()) { cl_queue_family_properties_intel qfprops[3]; size_t qsize = 0; - cl_int family = -1; + pi_uint32 family = 0; cl_uint partitionMaxSubDevices = 0; clGetDeviceInfo( cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index da931687f7af4..b61e21acfbd57 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -229,7 +229,8 @@ inline const OpenCLVersion V3_0(3, 0); struct _pi_device : _pi_object { _pi_device(pi_platform Plt) : Platform{Plt} { - subLevel = family = index = -1; + subLevel = -1; + family = index = 0; // NOTE: one must additionally call initialize() to complete // PI device creation. } @@ -237,9 +238,9 @@ struct _pi_device : _pi_object { pi_platform Platform; // Info stored for sub-sub device queue creation - int subLevel; // 0 - root device; 1 - sub-device; 2 - sub-sub-device - pi_int32 family; // SYCL queue family - pi_int32 index; // SYCL queue index inside a given family of queues + int subLevel; // 0 - root device; 1 - sub-device; 2 - sub-sub-device + pi_uint32 family; // SYCL queue family + pi_uint32 index; // SYCL queue index inside a given family of queues bool isRootDevice(void) { return subLevel == 0; } bool isSubDevice(void) { return subLevel == 1; } bool isSubSubDevice(void) { return subLevel == 2; } From f7ce8f6a3f65a11317e540a6a054a5a524696b86 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Mon, 9 Jan 2023 11:55:38 -0800 Subject: [PATCH 03/23] Fix test fail Signed-off-by: Arvind Sudarsanam --- sycl/test/abi/pi_opencl_symbol_check.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 0f7b53629196a..4db01abfc7286 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -10,6 +10,7 @@ piContextCreate piContextGetInfo piDeviceGetInfo +piDevicePartition piDevicesGet piEnqueueMemBufferMap piEventCreate From 9600fe68259b56472a804fadc854136cf0130673 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Mon, 9 Jan 2023 13:21:12 -0800 Subject: [PATCH 04/23] Addressing review comments Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 160 +++++++++++++++--------------- sycl/plugins/opencl/pi_opencl.hpp | 17 +++- 2 files changed, 94 insertions(+), 83 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index a330891ca67a2..3cdb3ce38abd7 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -272,30 +272,42 @@ static pi_result USMSetIndirectAccess(pi_kernel kernel) { extern "C" { -// Return sub-device level -// 0 -> root device -// 1 -> sub-device -// 2 -> sub-sub-device (CCS) -// -1 -> invalid device -static int getSubLevel(pi_device device) { +// Helper function +static _pi_device::device_level nextLevel(const _pi_device::device_level currLevel) { + switch (currLevel) { + case _pi_device::ROOTDEVICE : + return _pi_device::SUBDEVICE; + case _pi_device::SUBDEVICE: + return _pi_device::SUBSUBDEVICE; + default: + return _pi_device::INVALID; + } +} + +// Return device level +// _pi_device::ROOTDEVICE -> root device +// _pi_device::SUBDEVICE -> sub-device +// _pi_device::SUBSUBDEVICE -> sub-sub-device (CCS) +// _pi_device::INVALID -> invalid device +_pi_device::device_level getLevel(pi_device device) { if (!device) - return -1; + return _pi_device::INVALID; cl_device_id parentId = nullptr; clGetDeviceInfo(cast(device), CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), &parentId, NULL); if (parentId == nullptr) - return 0; + return _pi_device::ROOTDEVICE; cl_device_id parentParentId = nullptr; clGetDeviceInfo(parentId, CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), &parentParentId, NULL); if (parentParentId == nullptr) - return 1; + return _pi_device::SUBDEVICE; cl_device_id parentParentParentId = nullptr; clGetDeviceInfo(parentParentId, CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), &parentParentParentId, NULL); if (parentParentParentId == nullptr) - return 2; - return -1; + return _pi_device::SUBSUBDEVICE; + return _pi_device::INVALID; } pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, @@ -387,43 +399,9 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, case PI_DEVICE_INFO_PARTITION_PROPERTIES: { // SYCL spec says: if this SYCL device cannot be partitioned into at least // two sub devices then the returned vector must be empty. - pi_uint32 partitionMaxSubDevices = 0; - if (device->subLevel == -1) - device->subLevel = getSubLevel(device); - if (device->isRootDevice()) { - clGetDeviceInfo( - cast(device), CL_DEVICE_PARTITION_MAX_SUB_DEVICES, - sizeof(partitionMaxSubDevices), &partitionMaxSubDevices, nullptr); - } else if (device->isSubDevice()) { - // find out number of CCSes - bool supported = false; - cl_int ret_err = CL_SUCCESS; - ret_err = - checkDeviceExtensions(cast(device), - {"cl_intel_command_queue_families"}, supported); - if (ret_err != CL_SUCCESS) - return static_cast(ret_err); - if (!supported) { - std::cout - << "This device does not support cl_intel_command_queue_families" - << std::endl; - return ReturnValue(pi_device_partition_property{0}); - } - cl_queue_family_properties_intel qfprops[3]; - size_t qsize = 0; - clGetDeviceInfo( - cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, - 3*sizeof(cl_queue_family_properties_intel), qfprops, &qsize); - qsize = qsize/sizeof(cl_queue_family_properties_intel); - for ( size_t q = 0; q < qsize; q++ ) { - if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && - qfprops[q].count > partitionMaxSubDevices) { - partitionMaxSubDevices = qfprops[q].count; - } - } - } else { - return ReturnValue(pi_device_partition_property{0}); - } + pi_uint32 numSubDevices = 0; + if (device->level == _pi_device::INVALID) // level not yet updated + device->level = getLevel(device); auto ReturnHelper = [&](auto... Partitions) { struct { @@ -432,26 +410,54 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return ReturnValue(PartitionProperties); }; - // Root device - if (device->subLevel == -1) - device->subLevel = getSubLevel(device); - if (device->isRootDevice()) { - if (partitionMaxSubDevices < 2) { - return ReturnValue(pi_device_partition_property{0}); - } - return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } else if (device->isSubDevice()) { - if (partitionMaxSubDevices < 2) { - return ReturnValue(pi_device_partition_property{0}); + switch (device->level) { + case _pi_device::ROOTDEVICE: { + clGetDeviceInfo( + cast(device), CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + sizeof(numSubDevices), &numSubDevices, nullptr); + if (numSubDevices < 2) { + return ReturnValue(pi_device_partition_property{0}); + } + return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); } - if (ExposeCSliceInAffinityPartitioning) { - return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, - PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } else { + case _pi_device::SUBDEVICE: { + // find out number of CCSes + bool supported = false; + cl_int ret_err = CL_SUCCESS; + ret_err = + checkDeviceExtensions(cast(device), + {"cl_intel_command_queue_families"}, supported); + if (ret_err != CL_SUCCESS) + return static_cast(ret_err); + if (!supported) { + std::cout + << "This device does not support cl_intel_command_queue_families" + << std::endl; + return ReturnValue(pi_device_partition_property{0}); + } + cl_queue_family_properties_intel qfprops[3]; + size_t qsize = 0; + clGetDeviceInfo( + cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, + sizeof(qfprops), qfprops, &qsize); + qsize = qsize / sizeof(cl_queue_family_properties_intel); + for (size_t q = 0; q < qsize; q++) { + if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && + qfprops[q].count > numSubDevices) { + numSubDevices = qfprops[q].count; + } + } + if (numSubDevices < 2) { + return ReturnValue(pi_device_partition_property{0}); + } + if (ExposeCSliceInAffinityPartitioning) { + return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, + PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); } - } else { - return ReturnValue(pi_device_partition_property{0}); + default: + return ReturnValue(pi_device_partition_property{0}); } } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: @@ -459,8 +465,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, PI_DEVICE_AFFINITY_DOMAIN_NUMA | PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { - if (device->subLevel == -1) - device->subLevel = getSubLevel(device); + if (device->level == _pi_device::INVALID) // level not updated yet + device->level = getLevel(device); // For root-device there is no partitioning to report. if (device->isRootDevice()) return ReturnValue(pi_device_partition_property{0}); @@ -494,8 +500,8 @@ pi_result piDevicePartition(pi_device device, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices) { cl_int result = CL_DEVICE_NOT_FOUND; - if (device->subLevel == -1) - device->subLevel = getSubLevel(device); + if (device->level == _pi_device::INVALID) // level not updated yet + device->level = getLevel(device); // For root-device there is no partitioning to report. if (device->isRootDevice()) { result = clCreateSubDevices( @@ -505,7 +511,7 @@ pi_result piDevicePartition(pi_device device, cast(out_num_devices)); if (out_devices) { for (uint32_t i = 0; i < *out_num_devices; ++i) { - out_devices[i]->subLevel = device->subLevel + 1; + out_devices[i]->level = nextLevel(device->level); } } } else if (device->isSubDevice()) { @@ -515,9 +521,9 @@ pi_result piDevicePartition(pi_device device, cl_uint partitionMaxSubDevices = 0; clGetDeviceInfo( cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, - 3*sizeof(cl_queue_family_properties_intel), qfprops, &qsize); - qsize = qsize/sizeof(cl_queue_family_properties_intel); - for ( size_t q = 0; q < qsize; q++ ) { + sizeof(qfprops), qfprops, &qsize); + qsize = qsize / sizeof(cl_queue_family_properties_intel); + for (size_t q = 0; q < qsize; q++) { if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && qfprops[q].count > partitionMaxSubDevices) { family = q; @@ -529,9 +535,7 @@ pi_result piDevicePartition(pi_device device, for (uint32_t i = 0; i < *out_num_devices; ++i) { pi_device cloneDevice(device); out_devices[i] = cloneDevice; - } - for (uint32_t i = 0; i < *out_num_devices; ++i) { - out_devices[i]->subLevel = device->subLevel + 1; + out_devices[i]->level = nextLevel(device->level); out_devices[i]->family = family; out_devices[i]->index = i % (*out_num_devices); } @@ -580,7 +584,7 @@ pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, cast(num_devices)); for (pi_uint32 i = 0; i < num_entries; ++i) { - devices[i]->subLevel = 0; + devices[i]->level = _pi_device::ROOTDEVICE; } // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in num_devices if (result == CL_DEVICE_NOT_FOUND) { @@ -720,7 +724,7 @@ pi_result piQueueCreate(pi_context context, pi_device device, return cast(ret_err); } - if (device->subLevel == 2) { + if (device->level == _pi_device::SUBSUBDEVICE) { cl_queue_properties CreationFlagProperties[] = { CL_QUEUE_PROPERTIES, cast(properties) & SupportByOpenCL, diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index b61e21acfbd57..c752610f8b51a 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -228,8 +228,14 @@ inline const OpenCLVersion V3_0(3, 0); // plugin struct _pi_device : _pi_object { + enum device_level { + ROOTDEVICE = 0, + SUBDEVICE = 1, + SUBSUBDEVICE = 2, + INVALID = -1 + }; _pi_device(pi_platform Plt) : Platform{Plt} { - subLevel = -1; + level = INVALID; family = index = 0; // NOTE: one must additionally call initialize() to complete // PI device creation. @@ -238,12 +244,13 @@ struct _pi_device : _pi_object { pi_platform Platform; // Info stored for sub-sub device queue creation - int subLevel; // 0 - root device; 1 - sub-device; 2 - sub-sub-device + device_level level; pi_uint32 family; // SYCL queue family pi_uint32 index; // SYCL queue index inside a given family of queues - bool isRootDevice(void) { return subLevel == 0; } - bool isSubDevice(void) { return subLevel == 1; } - bool isSubSubDevice(void) { return subLevel == 2; } + + bool isRootDevice(void) { return level == ROOTDEVICE; } + bool isSubDevice(void) { return level == SUBDEVICE; } + bool isSubSubDevice(void) { return level == SUBSUBDEVICE; } }; #endif // PI_OPENCL_HPP From 59da75e1e7d07da9995a30dc74962f7fd0957bfa Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 19 Jan 2023 19:04:13 -0800 Subject: [PATCH 05/23] Address review comments Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/CMakeLists.txt | 7 ++ sycl/plugins/opencl/pi_opencl.cpp | 22 +----- sycl/plugins/opencl/pi_opencl.hpp | 109 +---------------------------- sycl/plugins/opencl/pi_utils.hpp | 79 --------------------- 4 files changed, 10 insertions(+), 207 deletions(-) delete mode 100644 sycl/plugins/opencl/pi_utils.hpp diff --git a/sycl/plugins/opencl/CMakeLists.txt b/sycl/plugins/opencl/CMakeLists.txt index a6638192f487c..f516dee7a2574 100644 --- a/sycl/plugins/opencl/CMakeLists.txt +++ b/sycl/plugins/opencl/CMakeLists.txt @@ -14,11 +14,18 @@ find_package(Threads REQUIRED) add_sycl_plugin(opencl SOURCES + "../unified_runtime/pi2ur.hpp" + "../unified_runtime/pi2ur.cpp" + "../unified_runtime/ur/ur.hpp" + "../unified_runtime/ur/ur.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "pi_opencl.cpp" + INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime # for Unified Runtime LIBRARIES OpenCL-ICD Threads::Threads + UnifiedRuntime-Headers ) set_target_properties(pi_opencl PROPERTIES LINKER_LANGUAGE CXX) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 3cdb3ce38abd7..6383dc0196769 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -33,14 +33,6 @@ #include #include -#include "pi_utils.hpp" - -static const bool ExposeCSliceInAffinityPartitioning = [] { - const char *Flag = - std::getenv("SYCL_PI_OPENCL_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); - return Flag ? std::atoi(Flag) != 0 : false; -}(); - #define PI_ASSERT(condition, error) \ if (!(condition)) \ return error; @@ -429,12 +421,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, {"cl_intel_command_queue_families"}, supported); if (ret_err != CL_SUCCESS) return static_cast(ret_err); - if (!supported) { - std::cout - << "This device does not support cl_intel_command_queue_families" - << std::endl; + if (!supported) return ReturnValue(pi_device_partition_property{0}); - } cl_queue_family_properties_intel qfprops[3]; size_t qsize = 0; clGetDeviceInfo( @@ -450,10 +438,6 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, if (numSubDevices < 2) { return ReturnValue(pi_device_partition_property{0}); } - if (ExposeCSliceInAffinityPartitioning) { - return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, - PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); } default: @@ -540,11 +524,9 @@ pi_result piDevicePartition(pi_device device, out_devices[i]->index = i % (*out_num_devices); } } - return PI_SUCCESS; } - // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in out_num_devices + // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in out_num_devices. if (result == CL_DEVICE_NOT_FOUND) { - std::cout << "Device not found\n"; assert(out_num_devices != 0); *out_num_devices = 0; return PI_SUCCESS; diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index c752610f8b51a..66fc11b123220 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -24,7 +24,7 @@ #include #include #include - +#include // This version should be incremented for any change made to this file or its // corresponding .cpp file. #define _PI_OPENCL_PLUGIN_VERSION 1 @@ -32,111 +32,6 @@ #define _PI_OPENCL_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_OPENCL_PLUGIN_VERSION) -// A single-threaded app has an opportunity to enable this mode to avoid -// overhead from mutex locking. Default value is 0 which means that single -// thread mode is disabled. -static const bool SingleThreadMode = [] { - const char *Ret = std::getenv("SYCL_PI_OPENCL_SINGLE_THREAD_MODE"); - const bool RetVal = Ret ? std::stoi(Ret) : 0; - return RetVal; -}(); - -// Class which acts like shared_mutex if SingleThreadMode variable is not set. -// If SingleThreadMode variable is set then mutex operations are turned into -// nop. -class pi_shared_mutex_ocl { - std::shared_mutex Mutex; - -public: - void lock() { - if (!SingleThreadMode) - Mutex.lock(); - } - bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } - void unlock() { - if (!SingleThreadMode) - Mutex.unlock(); - } - - void lock_shared() { - if (!SingleThreadMode) - Mutex.lock_shared(); - } - bool try_lock_shared() { - return SingleThreadMode ? true : Mutex.try_lock_shared(); - } - void unlock_shared() { - if (!SingleThreadMode) - Mutex.unlock_shared(); - } -}; - -// This wrapper around std::atomic is created to limit operations with reference -// counter and to make allowed operations more transparent in terms of -// thread-safety in the plugin. increment() and load() operations do not need a -// mutex guard around them since the underlying data is already atomic. -// decrementAndTest() method is used to guard a code which needs to be -// executed when object's ref count becomes zero after release. This method also -// doesn't need a mutex guard because decrement operation is atomic and only one -// thread can reach ref count equal to zero, i.e. only a single thread can pass -// through this check. -struct ReferenceCounter { - ReferenceCounter() : RefCount{1} {} - - // Reset the counter to the initial value. - void reset() { RefCount = 1; } - - // Used when retaining an object. - void increment() { RefCount++; } - - // Supposed to be used in pi*GetInfo* methods where ref count value is - // requested. - uint32_t load() { return RefCount.load(); } - - // This method allows to guard a code which needs to be executed when object's - // ref count becomes zero after release. It is important to notice that only a - // single thread can pass through this check. This is true because of several - // reasons: - // 1. Decrement operation is executed atomically. - // 2. It is not allowed to retain an object after its refcount reaches zero. - // 3. It is not allowed to release an object more times than the value of - // the ref count. - // 2. and 3. basically means that we can't use an object at all as soon as its - // refcount reaches zero. Using this check guarantees that code for deleting - // an object and releasing its resources is executed once by a single thread - // and we don't need to use any mutexes to guard access to this object in the - // scope after this check. Of course if we access another objects in this code - // (not the one which is being deleted) then access to these objects must be - // guarded, for example with a mutex. - bool decrementAndTest() { return --RefCount == 0; } - -private: - std::atomic RefCount; -}; - -// Base class to store common data -struct _pi_object { - _pi_object() : RefCount{} {} - - // Must be atomic to prevent data race when incrementing/decrementing. - ReferenceCounter RefCount; - - // This mutex protects accesses to all the non-const member variables. - // Exclusive access is required to modify any of these members. - // - // To get shared access to the object in a scope use std::shared_lock: - // std::shared_lock Lock(Obj->Mutex); - // To get exclusive access to the object in a scope use std::scoped_lock: - // std::scoped_lock Lock(Obj->Mutex); - // - // If several pi objects are accessed in a scope then each object's mutex must - // be locked. For example, to get write access to Obj1 and Obj2 and read - // access to Obj3 in a scope use the following approach: - // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); - // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); - pi_shared_mutex_ocl Mutex; -}; - namespace OCLV { class OpenCLVersion { protected: @@ -237,8 +132,6 @@ struct _pi_device : _pi_object { _pi_device(pi_platform Plt) : Platform{Plt} { level = INVALID; family = index = 0; - // NOTE: one must additionally call initialize() to complete - // PI device creation. } // PI platform to which this device belongs. pi_platform Platform; diff --git a/sycl/plugins/opencl/pi_utils.hpp b/sycl/plugins/opencl/pi_utils.hpp deleted file mode 100644 index e6330144c7b92..0000000000000 --- a/sycl/plugins/opencl/pi_utils.hpp +++ /dev/null @@ -1,79 +0,0 @@ -//===--------- pi_utils.hpp - Plugin Utility Functions -------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===-----------------------------------------------------------------===// - -#ifndef PI_UTILS_HPP -#define PI_UTILS_HPP - -// This version should be incremented for any change made to this file or its -// corresponding .cpp file. -#define _PI_LEVEL_ZERO_PLUGIN_VERSION 1 - -#define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \ - _PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION) - -#define ARG_UNUSED(x) (void)x - -namespace { - -// Helper functions for unified 'Return' type declaration - imported -// from pi_level_zero.cpp - -template -pi_result getInfoImpl(size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet, T Value, size_t ValueSize, - Assign &&AssignFunc) { - if (ParamValue != nullptr) { - if (ParamValueSize < ValueSize) { - return PI_ERROR_INVALID_VALUE; - } - AssignFunc(ParamValue, Value, ValueSize); - } - if (ParamValueSizeRet != nullptr) { - *ParamValueSizeRet = ValueSize; - } - return PI_SUCCESS; -} - -template -pi_result getInfo(size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet, T Value) { - auto assignment = [](void *ParamValue, T Value, size_t ValueSize) { - ARG_UNUSED(ValueSize); - *static_cast(ParamValue) = Value; - }; - return getInfoImpl(ParamValueSize, ParamValue, ParamValueSizeRet, Value, - sizeof(T), assignment); -} - -template -pi_result getInfoArray(size_t ArrayLength, size_t ParamValueSize, - void *ParamValue, size_t *ParamValueSizeRet, T *Value) { - return getInfoImpl(ParamValueSize, ParamValue, ParamValueSizeRet, Value, - ArrayLength * sizeof(T), memcpy); -} - -class ReturnHelper { -public: - ReturnHelper(size_t param_value_size, void *param_value, - size_t *param_value_size_ret) - : param_value_size(param_value_size), param_value(param_value), - param_value_size_ret(param_value_size_ret) {} - - template pi_result operator()(const T &t) { - return getInfo(param_value_size, param_value, param_value_size_ret, t); - } - -private: - size_t param_value_size; - void *param_value; - size_t *param_value_size_ret; -}; - -} // anonymous namespace - -#endif // PI_UTILS_HPP From 59f5741731c9f794d6335b9f7de704340a3bcf3b Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 19 Jan 2023 19:06:51 -0800 Subject: [PATCH 06/23] Minor format issue Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index 66fc11b123220..2e69ad45078b8 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -20,11 +20,11 @@ #include #include #include +#include #include #include #include #include -#include // This version should be incremented for any change made to this file or its // corresponding .cpp file. #define _PI_OPENCL_PLUGIN_VERSION 1 From 27d23b1ad48d8706a75e8faeee5a8feae19fd557 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 19 Jan 2023 19:36:19 -0800 Subject: [PATCH 07/23] Minor changes Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 6383dc0196769..ba15ce61a91da 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -413,7 +413,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); } case _pi_device::SUBDEVICE: { - // find out number of CCSes + // Find out number of CCSes. bool supported = false; cl_int ret_err = CL_SUCCESS; ret_err = @@ -449,8 +449,6 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, PI_DEVICE_AFFINITY_DOMAIN_NUMA | PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { - if (device->level == _pi_device::INVALID) // level not updated yet - device->level = getLevel(device); // For root-device there is no partitioning to report. if (device->isRootDevice()) return ReturnValue(pi_device_partition_property{0}); From c541d4b343799223acf1f1de56efb10c3750c8cf Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 19 Jan 2023 21:14:09 -0800 Subject: [PATCH 08/23] Readding a return statement that was removed erroneously Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index ba15ce61a91da..0819d3913b5aa 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -522,6 +522,7 @@ pi_result piDevicePartition(pi_device device, out_devices[i]->index = i % (*out_num_devices); } } + return PI_SUCCESS; } // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in out_num_devices. if (result == CL_DEVICE_NOT_FOUND) { From 6e6d6b29236d2c6a84d844532aaad57ed8bd541e Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Fri, 20 Jan 2023 22:01:34 -0800 Subject: [PATCH 09/23] Major revision Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 273 +++++++++++----------- sycl/plugins/opencl/pi_opencl.hpp | 7 +- sycl/test/abi/pi_opencl_symbol_check.dump | 2 + 3 files changed, 149 insertions(+), 133 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 0819d3913b5aa..6c380a9ae5b2a 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -17,7 +17,6 @@ #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #include -#include #include #include #include @@ -265,48 +264,23 @@ static pi_result USMSetIndirectAccess(pi_kernel kernel) { extern "C" { // Helper function -static _pi_device::device_level nextLevel(const _pi_device::device_level currLevel) { +static _pi_device::device_level +nextLevel(const _pi_device::device_level currLevel) { switch (currLevel) { - case _pi_device::ROOTDEVICE : - return _pi_device::SUBDEVICE; - case _pi_device::SUBDEVICE: - return _pi_device::SUBSUBDEVICE; - default: - return _pi_device::INVALID; - } -} - -// Return device level -// _pi_device::ROOTDEVICE -> root device -// _pi_device::SUBDEVICE -> sub-device -// _pi_device::SUBSUBDEVICE -> sub-sub-device (CCS) -// _pi_device::INVALID -> invalid device -_pi_device::device_level getLevel(pi_device device) { - if (!device) - return _pi_device::INVALID; - cl_device_id parentId = nullptr; - clGetDeviceInfo(cast(device), CL_DEVICE_PARENT_DEVICE, - sizeof(cl_device_id), &parentId, NULL); - if (parentId == nullptr) - return _pi_device::ROOTDEVICE; - cl_device_id parentParentId = nullptr; - clGetDeviceInfo(parentId, CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), - &parentParentId, NULL); - if (parentParentId == nullptr) + case _pi_device::ROOTDEVICE: return _pi_device::SUBDEVICE; - cl_device_id parentParentParentId = nullptr; - clGetDeviceInfo(parentParentId, CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), - &parentParentParentId, NULL); - if (parentParentParentId == nullptr) + case _pi_device::SUBDEVICE: return _pi_device::SUBSUBDEVICE; - return _pi_device::INVALID; + default: + return _pi_device::INVALID; + } } pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) { PI_ASSERT(device, PI_ERROR_INVALID_DEVICE); - ReturnHelper ReturnValue(paramValueSize, paramValue, paramValueSizeRet); + ReturnHelper return_value(paramValueSize, paramValue, paramValueSizeRet); switch (paramName) { // TODO: Check regularly to see if support in enabled in OpenCL. // Intel GPU EU device-specific information extensions. @@ -333,7 +307,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, bool supported = false; ret_err = checkDeviceExtensions( - cast(device), + device->cl_device, {"cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics"}, supported); if (ret_err != CL_SUCCESS) @@ -356,7 +330,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, } case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { cl_device_type devType = CL_DEVICE_TYPE_DEFAULT; - cl_int res = clGetDeviceInfo(cast(device), CL_DEVICE_TYPE, + cl_int res = clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, sizeof(cl_device_type), &devType, nullptr); // FIXME: here we assume that program built for a root GPU device can be @@ -391,88 +365,120 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, case PI_DEVICE_INFO_PARTITION_PROPERTIES: { // SYCL spec says: if this SYCL device cannot be partitioned into at least // two sub devices then the returned vector must be empty. - pi_uint32 numSubDevices = 0; - if (device->level == _pi_device::INVALID) // level not yet updated - device->level = getLevel(device); + pi_uint32 num_sub_devices = 0; auto ReturnHelper = [&](auto... Partitions) { struct { - pi_device_partition_property Arr[sizeof...(Partitions) + 1]; - } PartitionProperties = {{Partitions..., 0}}; - return ReturnValue(PartitionProperties); + pi_device_partition_property arr[sizeof...(Partitions) + 1]; + } partition_properties = {{Partitions..., 0}}; + return return_value(partition_properties); }; + clGetDeviceInfo(device->cl_device, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + sizeof(num_sub_devices), &num_sub_devices, nullptr); + cl_device_type device_type; + cl_int res = clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, + sizeof(cl_device_type), &device_type, nullptr); + cl_bool is_gpu = (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); + + // Partition property for CPU + if (!is_gpu) { + if (num_sub_devices < 2) + return return_value(pi_device_partition_property{0}); + cl_device_partition_property props[2] = {0, 0}; + size_t props_ret_size = 0; + clGetDeviceInfo(device->cl_device, CL_DEVICE_PARTITION_PROPERTIES, + sizeof(props), &props, &props_ret_size); + switch (props_ret_size) { + case 0: + return return_value(pi_device_partition_property{0}); + case 1: { + struct { + pi_device_partition_property arr[2]; + } partition_properties = { + {cast(props[0]), 0}}; + return return_value(partition_properties); + } + case 2: { + struct { + pi_device_partition_property arr[3]; + } partition_properties = {{cast(props[0]), + cast(props[1]), + 0}}; + return return_value(partition_properties); + } + default: + return PI_ERROR_INVALID_VALUE; + } + } + + // Partition property for GPU switch (device->level) { - case _pi_device::ROOTDEVICE: { - clGetDeviceInfo( - cast(device), CL_DEVICE_PARTITION_MAX_SUB_DEVICES, - sizeof(numSubDevices), &numSubDevices, nullptr); - if (numSubDevices < 2) { - return ReturnValue(pi_device_partition_property{0}); + case _pi_device::ROOTDEVICE: { + if (num_sub_devices < 2) + return return_value(pi_device_partition_property{0}); + return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } + case _pi_device::SUBDEVICE: { + // Find out number of CCSes. + bool supported = false; + cl_int ret_err = CL_SUCCESS; + ret_err = checkDeviceExtensions( + device->cl_device, {"cl_intel_command_queue_families"}, supported); + if (ret_err != CL_SUCCESS) + return static_cast(ret_err); + if (!supported) + return return_value(pi_device_partition_property{0}); + cl_queue_family_properties_intel qfprops[3]; + size_t qsize = 0; + clGetDeviceInfo(device->cl_device, + CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, sizeof(qfprops), + qfprops, &qsize); + qsize = qsize / sizeof(cl_queue_family_properties_intel); + for (size_t q = 0; q < qsize; q++) { + if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && + qfprops[q].count > num_sub_devices) { + num_sub_devices = qfprops[q].count; } - return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); } - case _pi_device::SUBDEVICE: { - // Find out number of CCSes. - bool supported = false; - cl_int ret_err = CL_SUCCESS; - ret_err = - checkDeviceExtensions(cast(device), - {"cl_intel_command_queue_families"}, supported); - if (ret_err != CL_SUCCESS) - return static_cast(ret_err); - if (!supported) - return ReturnValue(pi_device_partition_property{0}); - cl_queue_family_properties_intel qfprops[3]; - size_t qsize = 0; - clGetDeviceInfo( - cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, - sizeof(qfprops), qfprops, &qsize); - qsize = qsize / sizeof(cl_queue_family_properties_intel); - for (size_t q = 0; q < qsize; q++) { - if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && - qfprops[q].count > numSubDevices) { - numSubDevices = qfprops[q].count; - } - } - if (numSubDevices < 2) { - return ReturnValue(pi_device_partition_property{0}); - } - return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); + if (num_sub_devices < 2) { + return return_value(pi_device_partition_property{0}); } - default: - return ReturnValue(pi_device_partition_property{0}); + return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); + } + default: + return return_value(pi_device_partition_property{0}); } } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: - return ReturnValue(pi_device_affinity_domain{ + return return_value(pi_device_affinity_domain{ PI_DEVICE_AFFINITY_DOMAIN_NUMA | PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { // For root-device there is no partitioning to report. if (device->isRootDevice()) - return ReturnValue(pi_device_partition_property{0}); + return return_value(pi_device_partition_property{0}); if (device->isSubDevice()) { struct { - pi_device_partition_property Arr[3]; - } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, - 0}}; - return ReturnValue(PartitionProperties); + pi_device_partition_property arr[3]; + } partition_properties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, + 0}}; + return return_value(partition_properties); } - if (device->isSubSubDevice()) { + if (device->isCCS()) { struct { - pi_device_partition_property Arr[2]; - } PartitionProperties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; - return ReturnValue(PartitionProperties); + pi_device_partition_property arr[2]; + } partition_properties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; + return return_value(partition_properties); } - return ReturnValue(pi_device_partition_property{0}); + return return_value(pi_device_partition_property{0}); } default: - cl_int result = clGetDeviceInfo( - cast(device), cast(paramName), - paramValueSize, paramValue, paramValueSizeRet); + cl_int result = + clGetDeviceInfo(device->cl_device, cast(paramName), + paramValueSize, paramValue, paramValueSizeRet); return static_cast(result); } } @@ -482,17 +488,15 @@ pi_result piDevicePartition(pi_device device, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices) { cl_int result = CL_DEVICE_NOT_FOUND; - if (device->level == _pi_device::INVALID) // level not updated yet - device->level = getLevel(device); - // For root-device there is no partitioning to report. + std::vector cl_devices(num_devices); if (device->isRootDevice()) { result = clCreateSubDevices( - cast(device), + device->cl_device, cast(properties), - cast(num_devices), cast(out_devices), - cast(out_num_devices)); + cast(num_devices), cl_devices.data(), out_num_devices); if (out_devices) { for (uint32_t i = 0; i < *out_num_devices; ++i) { + out_devices[i] = new _pi_device(cl_devices[i]); out_devices[i]->level = nextLevel(device->level); } } @@ -500,23 +504,21 @@ pi_result piDevicePartition(pi_device device, cl_queue_family_properties_intel qfprops[3]; size_t qsize = 0; pi_uint32 family = 0; - cl_uint partitionMaxSubDevices = 0; - clGetDeviceInfo( - cast(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, - sizeof(qfprops), qfprops, &qsize); + cl_uint sub_device_count = 0; + clGetDeviceInfo(device->cl_device, CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, + sizeof(qfprops), qfprops, &qsize); qsize = qsize / sizeof(cl_queue_family_properties_intel); for (size_t q = 0; q < qsize; q++) { if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && - qfprops[q].count > partitionMaxSubDevices) { + qfprops[q].count > sub_device_count) { family = q; - partitionMaxSubDevices = qfprops[q].count; + sub_device_count = qfprops[q].count; } } - *out_num_devices = partitionMaxSubDevices; + *out_num_devices = sub_device_count; if (out_devices) { for (uint32_t i = 0; i < *out_num_devices; ++i) { - pi_device cloneDevice(device); - out_devices[i] = cloneDevice; + out_devices[i] = new _pi_device(device->cl_device); out_devices[i]->level = nextLevel(device->level); out_devices[i]->family = family; out_devices[i]->index = i % (*out_num_devices); @@ -533,6 +535,16 @@ pi_result piDevicePartition(pi_device device, return cast(result); } +pi_result piDeviceRetain(pi_device device) { + cl_int result = clRetainDevice(device->cl_device); + return cast(result); +} + +pi_result piDeviceRelease(pi_device device) { + cl_int result = clReleaseDevice(device->cl_device); + return cast(result); +} + pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms) { cl_int result = clGetPlatformIDs(cast(num_entries), @@ -559,12 +571,13 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle nativeHandle, pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, pi_uint32 num_entries, pi_device *devices, pi_uint32 *num_devices) { - cl_int result = clGetDeviceIDs( - cast(platform), cast(device_type), - cast(num_entries), cast(devices), - cast(num_devices)); - + std::vector cl_devices(num_entries); + cl_int result = clGetDeviceIDs(cast(platform), + cast(device_type), + cast(num_entries), cl_devices.data(), + cast(num_devices)); for (pi_uint32 i = 0; i < num_entries; ++i) { + devices[i] = new _pi_device(cl_devices[i]); devices[i]->level = _pi_device::ROOTDEVICE; } // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in num_devices @@ -599,8 +612,8 @@ pi_result piextDeviceSelectBinary(pi_device device, pi_device_binary *images, cl_device_type device_type; constexpr pi_uint32 invalid_ind = std::numeric_limits::max(); cl_int ret_err = - clGetDeviceInfo(cast(device), CL_DEVICE_TYPE, - sizeof(cl_device_type), &device_type, nullptr); + clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, sizeof(cl_device_type), + &device_type, nullptr); if (ret_err != CL_SUCCESS) { *selected_image_ind = invalid_ind; return cast(ret_err); @@ -675,7 +688,7 @@ pi_result piQueueCreate(pi_context context, pi_device device, cl_platform_id curPlatform; cl_int ret_err = - clGetDeviceInfo(cast(device), CL_DEVICE_PLATFORM, + clGetDeviceInfo(device->cl_device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &curPlatform, nullptr); CHECK_ERR_SET_NULL_RET(ret_err, queue, ret_err); @@ -699,7 +712,7 @@ pi_result piQueueCreate(pi_context context, pi_device device, if (version >= OCLV::V2_0) { *queue = cast(clCreateCommandQueue( - cast(context), cast(device), + cast(context), device->cl_device, cast(properties) & SupportByOpenCL, &ret_err)); return cast(ret_err); @@ -715,15 +728,15 @@ pi_result piQueueCreate(pi_context context, pi_device device, device->index, 0}; *queue = cast(clCreateCommandQueueWithProperties( - cast(context), cast(device), - CreationFlagProperties, &ret_err)); + cast(context), device->cl_device, CreationFlagProperties, + &ret_err)); } else { cl_queue_properties CreationFlagProperties[] = { CL_QUEUE_PROPERTIES, cast(properties) & SupportByOpenCL, 0}; *queue = cast(clCreateCommandQueueWithProperties( - cast(context), cast(device), - CreationFlagProperties, &ret_err)); + cast(context), device->cl_device, CreationFlagProperties, + &ret_err)); } return cast(ret_err); } @@ -991,9 +1004,9 @@ pi_result piextGetDeviceFunctionPointer(pi_device device, pi_program program, // If clGetDeviceFunctionPointer is in list of extensions if (FuncT) { - pi_ret_err = cast(FuncT(cast(device), - cast(program), func_name, - function_pointer_ret)); + pi_ret_err = + cast(FuncT(device->cl_device, cast(program), + func_name, function_pointer_ret)); // GPU runtime sometimes returns PI_ERROR_INVALID_ARG_VALUE if func address // cannot be found even if kernel exits. As the kernel does exist return // that the address is not available @@ -1184,7 +1197,7 @@ pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, return PI_ERROR_INVALID_VALUE; default: cl_int result = clGetKernelWorkGroupInfo( - cast(kernel), cast(device), + cast(kernel), device->cl_device, cast(param_name), param_value_size, param_value, param_value_size_ret); return static_cast(result); @@ -1226,7 +1239,7 @@ pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, } ret_err = cast(clGetKernelSubGroupInfo( - cast(kernel), cast(device), + cast(kernel), device->cl_device, cast(param_name), input_value_size, input_value, sizeof(size_t), &ret_val, param_value_size_ret)); @@ -1347,7 +1360,7 @@ pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context, context, &FuncPtr); if (FuncPtr) { - Ptr = FuncPtr(cast(context), cast(device), + Ptr = FuncPtr(cast(context), device->cl_device, cast(properties), size, alignment, cast(&RetVal)); } @@ -1385,7 +1398,7 @@ pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, context, &FuncPtr); if (FuncPtr) { - Ptr = FuncPtr(cast(context), cast(device), + Ptr = FuncPtr(cast(context), device->cl_device, cast(properties), size, alignment, cast(&RetVal)); } @@ -1891,8 +1904,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piDevicesGet, piDevicesGet) _PI_CL(piDeviceGetInfo, piDeviceGetInfo) _PI_CL(piDevicePartition, piDevicePartition) - _PI_CL(piDeviceRetain, clRetainDevice) - _PI_CL(piDeviceRelease, clReleaseDevice) + _PI_CL(piDeviceRetain, piDeviceRetain) + _PI_CL(piDeviceRelease, piDeviceRelease) _PI_CL(piextDeviceSelectBinary, piextDeviceSelectBinary) _PI_CL(piextGetDeviceFunctionPointer, piextGetDeviceFunctionPointer) _PI_CL(piextDeviceGetNativeHandle, piextDeviceGetNativeHandle) diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index 2e69ad45078b8..b3ffe9a32c3d9 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include // This version should be incremented for any change made to this file or its // corresponding .cpp file. @@ -129,12 +130,12 @@ struct _pi_device : _pi_object { SUBSUBDEVICE = 2, INVALID = -1 }; - _pi_device(pi_platform Plt) : Platform{Plt} { + _pi_device(cl_device_id cl_dev) : cl_device{cl_dev} { level = INVALID; family = index = 0; } // PI platform to which this device belongs. - pi_platform Platform; + cl_device_id cl_device; // Info stored for sub-sub device queue creation device_level level; @@ -143,7 +144,7 @@ struct _pi_device : _pi_object { bool isRootDevice(void) { return level == ROOTDEVICE; } bool isSubDevice(void) { return level == SUBDEVICE; } - bool isSubSubDevice(void) { return level == SUBSUBDEVICE; } + bool isCCS(void) { return level == SUBSUBDEVICE; } }; #endif // PI_OPENCL_HPP diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 4db01abfc7286..1f48ffd9d629f 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -11,6 +11,8 @@ piContextCreate piContextGetInfo piDeviceGetInfo piDevicePartition +piDeviceRelease +piDeviceRetain piDevicesGet piEnqueueMemBufferMap piEventCreate From 82b312cf48b125992d3444880dbd775be7b71d3f Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Fri, 20 Jan 2023 22:34:11 -0800 Subject: [PATCH 10/23] Cleanup Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 38 ++++++++++++------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 6c380a9ae5b2a..b86633cb10503 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -29,13 +28,8 @@ #include #include #include -#include #include -#define PI_ASSERT(condition, error) \ - if (!(condition)) \ - return error; - #define CHECK_ERR_SET_NULL_RET(err, ptr, reterr) \ if (err != CL_SUCCESS) { \ if (ptr != nullptr) \ @@ -366,7 +360,11 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, // SYCL spec says: if this SYCL device cannot be partitioned into at least // two sub devices then the returned vector must be empty. pi_uint32 num_sub_devices = 0; + clGetDeviceInfo(device->cl_device, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + sizeof(num_sub_devices), &num_sub_devices, nullptr); + // Check is done later for devices at root level. + // Helper function to populate property and return success/failure. auto ReturnHelper = [&](auto... Partitions) { struct { pi_device_partition_property arr[sizeof...(Partitions) + 1]; @@ -374,14 +372,15 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return return_value(partition_properties); }; - clGetDeviceInfo(device->cl_device, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, - sizeof(num_sub_devices), &num_sub_devices, nullptr); + // Identify device type. cl_device_type device_type; cl_int res = clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, nullptr); cl_bool is_gpu = (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); - // Partition property for CPU + // Partition property for non GPU backends. + // For non-GPU backends, partition property are obtained by calling + // clGetDeviceInfo. if (!is_gpu) { if (num_sub_devices < 2) return return_value(pi_device_partition_property{0}); @@ -392,21 +391,12 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, switch (props_ret_size) { case 0: return return_value(pi_device_partition_property{0}); - case 1: { - struct { - pi_device_partition_property arr[2]; - } partition_properties = { - {cast(props[0]), 0}}; - return return_value(partition_properties); - } - case 2: { - struct { - pi_device_partition_property arr[3]; - } partition_properties = {{cast(props[0]), - cast(props[1]), - 0}}; - return return_value(partition_properties); - } + case 1: + ReturnHelper(props[0]); + [[fallthrough]]; + case 2: + ReturnHelper(props[0], props[1]); + [[fallthrough]]; default: return PI_ERROR_INVALID_VALUE; } From 970445a3bf27695f1c26a2dcf217816080f12c22 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Sat, 21 Jan 2023 06:10:35 -0800 Subject: [PATCH 11/23] fix test failure Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 33 +++++++++++++++++++------------ 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index b86633cb10503..b72479d64e213 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1015,10 +1015,12 @@ pi_result piContextCreate(const pi_context_properties *properties, size_t cb, void *user_data1), void *user_data, pi_context *retcontext) { pi_result ret = PI_ERROR_INVALID_OPERATION; + std::vector cl_devices(num_devices); + for (size_t i = 0; i < num_devices; ++i) + cl_devices[i] = devices[i]->cl_device; *retcontext = cast( - clCreateContext(properties, cast(num_devices), - cast(devices), pfn_notify, - user_data, cast(&ret))); + clCreateContext(properties, cast(num_devices), cl_devices.data(), + pfn_notify, user_data, cast(&ret))); return ret; } @@ -1140,10 +1142,13 @@ pi_result piProgramCreateWithBinary( (void)num_metadata_entries; pi_result ret_err = PI_ERROR_INVALID_OPERATION; + std::vector cl_devices(num_devices); + for (size_t i = 0; i < num_devices; ++i) + cl_devices[i] = device_list[i]->cl_device; *ret_program = cast(clCreateProgramWithBinary( - cast(context), cast(num_devices), - cast(device_list), lengths, binaries, - cast(binary_status), cast(&ret_err))); + cast(context), cast(num_devices), cl_devices.data(), + lengths, binaries, cast(binary_status), + cast(&ret_err))); return ret_err; } @@ -1155,13 +1160,15 @@ pi_result piProgramLink(pi_context context, pi_uint32 num_devices, void *user_data, pi_program *ret_program) { pi_result ret_err = PI_ERROR_INVALID_OPERATION; - *ret_program = cast( - clLinkProgram(cast(context), cast(num_devices), - cast(device_list), options, - cast(num_input_programs), - cast(input_programs), - cast(pfn_notify), user_data, - cast(&ret_err))); + std::vector cl_devices(num_devices); + for (size_t i = 0; i < num_devices; ++i) + cl_devices[i] = device_list[i]->cl_device; + *ret_program = cast(clLinkProgram( + cast(context), cast(num_devices), cl_devices.data(), + options, cast(num_input_programs), + cast(input_programs), + cast(pfn_notify), user_data, + cast(&ret_err))); return ret_err; } From 8e1fe6f464f5a8c77c1306daa8cab0f1e88619d7 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 25 Jan 2023 13:36:45 -0800 Subject: [PATCH 12/23] Another round of revision Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 239 ++++++++++++++-------- sycl/plugins/opencl/pi_opencl.hpp | 56 ++--- sycl/test/abi/pi_opencl_symbol_check.dump | 1 + 3 files changed, 186 insertions(+), 110 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index b72479d64e213..f569797c9e250 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -257,18 +257,47 @@ static pi_result USMSetIndirectAccess(pi_kernel kernel) { extern "C" { -// Helper function -static _pi_device::device_level -nextLevel(const _pi_device::device_level currLevel) { - switch (currLevel) { - case _pi_device::ROOTDEVICE: - return _pi_device::SUBDEVICE; - case _pi_device::SUBDEVICE: - return _pi_device::SUBSUBDEVICE; - default: - return _pi_device::INVALID; - } +// Helper functions +// Returns true if the device is a cslice subdevice. +static bool isCCS(pi_device device) { + if (!device) + return false; + return cslice_devices.find(device) != cslice_devices.end(); +} + +// Returns the underlysing Cl device. +// For a regular pi_device, cl_device_id can be obtained by a simple typecast. +// For a cslice subdevice, we explicitly store the cl_device_id and then +// retrieve it when needed. +static cl_device_id getClDevice(pi_device device) { + assert(device); + if (isCCS(device)) + return cslice_devices[device].cl_dev; + else + return cast(device); +} + +// Returns true if the device is a root device. +static bool isRootDevice(pi_device device) { + if (!device) + return false; + cl_device_id parentId = nullptr; + clGetDeviceInfo(getClDevice(device), CL_DEVICE_PARENT_DEVICE, + sizeof(cl_device_id), &parentId, NULL); + if (parentId == nullptr) + return true; + return false; +} + +// Returns the list of underlying cl_devices. +static std::vector getClDevices(pi_uint32 num_devices, + const pi_device *devices) { + std::vector cl_devices(num_devices); + for (size_t i = 0; i < num_devices; ++i) + cl_devices[i] = getClDevice(devices[i]); + return cl_devices; } +// End of helper functions pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, size_t paramValueSize, void *paramValue, @@ -301,7 +330,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, bool supported = false; ret_err = checkDeviceExtensions( - device->cl_device, + getClDevice(device), {"cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics"}, supported); if (ret_err != CL_SUCCESS) @@ -324,7 +353,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, } case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { cl_device_type devType = CL_DEVICE_TYPE_DEFAULT; - cl_int res = clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, + cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, sizeof(cl_device_type), &devType, nullptr); // FIXME: here we assume that program built for a root GPU device can be @@ -360,7 +389,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, // SYCL spec says: if this SYCL device cannot be partitioned into at least // two sub devices then the returned vector must be empty. pi_uint32 num_sub_devices = 0; - clGetDeviceInfo(device->cl_device, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + clGetDeviceInfo(getClDevice(device), CL_DEVICE_PARTITION_MAX_SUB_DEVICES, sizeof(num_sub_devices), &num_sub_devices, nullptr); // Check is done later for devices at root level. @@ -374,7 +403,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, // Identify device type. cl_device_type device_type; - cl_int res = clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, + cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, nullptr); cl_bool is_gpu = (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); @@ -386,7 +415,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return return_value(pi_device_partition_property{0}); cl_device_partition_property props[2] = {0, 0}; size_t props_ret_size = 0; - clGetDeviceInfo(device->cl_device, CL_DEVICE_PARTITION_PROPERTIES, + clGetDeviceInfo(getClDevice(device), CL_DEVICE_PARTITION_PROPERTIES, sizeof(props), &props, &props_ret_size); switch (props_ret_size) { case 0: @@ -403,25 +432,23 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, } // Partition property for GPU - switch (device->level) { - case _pi_device::ROOTDEVICE: { + if (isRootDevice(device)) { if (num_sub_devices < 2) return return_value(pi_device_partition_property{0}); return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } - case _pi_device::SUBDEVICE: { + } else if (!isCCS(device)) { // it is subdevice // Find out number of CCSes. bool supported = false; cl_int ret_err = CL_SUCCESS; ret_err = checkDeviceExtensions( - device->cl_device, {"cl_intel_command_queue_families"}, supported); + getClDevice(device), {"cl_intel_command_queue_families"}, supported); if (ret_err != CL_SUCCESS) return static_cast(ret_err); if (!supported) return return_value(pi_device_partition_property{0}); cl_queue_family_properties_intel qfprops[3]; size_t qsize = 0; - clGetDeviceInfo(device->cl_device, + clGetDeviceInfo(getClDevice(device), CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, sizeof(qfprops), qfprops, &qsize); qsize = qsize / sizeof(cl_queue_family_properties_intel); @@ -435,10 +462,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return return_value(pi_device_partition_property{0}); } return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); - } - default: + } else // it is CCS return return_value(pi_device_partition_property{0}); - } } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: return return_value(pi_device_affinity_domain{ @@ -446,17 +471,16 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { // For root-device there is no partitioning to report. - if (device->isRootDevice()) + if (isRootDevice(device)) return return_value(pi_device_partition_property{0}); - if (device->isSubDevice()) { + if (!isCCS(device)) { // is subdevice struct { pi_device_partition_property arr[3]; } partition_properties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0}}; return return_value(partition_properties); - } - if (device->isCCS()) { + } else { // it is CCS struct { pi_device_partition_property arr[2]; } partition_properties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; @@ -467,7 +491,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, default: cl_int result = - clGetDeviceInfo(device->cl_device, cast(paramName), + clGetDeviceInfo(getClDevice(device), cast(paramName), paramValueSize, paramValue, paramValueSizeRet); return static_cast(result); } @@ -478,25 +502,20 @@ pi_result piDevicePartition(pi_device device, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices) { cl_int result = CL_DEVICE_NOT_FOUND; - std::vector cl_devices(num_devices); - if (device->isRootDevice()) { + if (isRootDevice(device)) { result = clCreateSubDevices( - device->cl_device, + getClDevice(device), cast(properties), - cast(num_devices), cl_devices.data(), out_num_devices); - if (out_devices) { - for (uint32_t i = 0; i < *out_num_devices; ++i) { - out_devices[i] = new _pi_device(cl_devices[i]); - out_devices[i]->level = nextLevel(device->level); - } - } - } else if (device->isSubDevice()) { + cast(num_devices), cast(out_devices), + out_num_devices); + } else if (!isCCS(device)) { cl_queue_family_properties_intel qfprops[3]; size_t qsize = 0; pi_uint32 family = 0; cl_uint sub_device_count = 0; - clGetDeviceInfo(device->cl_device, CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, - sizeof(qfprops), qfprops, &qsize); + clGetDeviceInfo(getClDevice(device), + CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, sizeof(qfprops), + qfprops, &qsize); qsize = qsize / sizeof(cl_queue_family_properties_intel); for (size_t q = 0; q < qsize; q++) { if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && @@ -508,10 +527,12 @@ pi_result piDevicePartition(pi_device device, *out_num_devices = sub_device_count; if (out_devices) { for (uint32_t i = 0; i < *out_num_devices; ++i) { - out_devices[i] = new _pi_device(device->cl_device); - out_devices[i]->level = nextLevel(device->level); - out_devices[i]->family = family; - out_devices[i]->index = i % (*out_num_devices); + out_devices[i] = cast(new cl_device_id()); + csliceSubDevInfo info; + info.cl_dev = cast(device); + info.family = family; + info.index = i % (*out_num_devices); + cslice_devices.insert({out_devices[i], info}); } } return PI_SUCCESS; @@ -526,12 +547,12 @@ pi_result piDevicePartition(pi_device device, } pi_result piDeviceRetain(pi_device device) { - cl_int result = clRetainDevice(device->cl_device); + cl_int result = clRetainDevice(getClDevice(device)); return cast(result); } pi_result piDeviceRelease(pi_device device) { - cl_int result = clReleaseDevice(device->cl_device); + cl_int result = clReleaseDevice(getClDevice(device)); return cast(result); } @@ -561,15 +582,10 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle nativeHandle, pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, pi_uint32 num_entries, pi_device *devices, pi_uint32 *num_devices) { - std::vector cl_devices(num_entries); - cl_int result = clGetDeviceIDs(cast(platform), - cast(device_type), - cast(num_entries), cl_devices.data(), - cast(num_devices)); - for (pi_uint32 i = 0; i < num_entries; ++i) { - devices[i] = new _pi_device(cl_devices[i]); - devices[i]->level = _pi_device::ROOTDEVICE; - } + cl_int result = clGetDeviceIDs( + cast(platform), cast(device_type), + cast(num_entries), cast(devices), + cast(num_devices)); // Absorb the CL_DEVICE_NOT_FOUND and just return 0 in num_devices if (result == CL_DEVICE_NOT_FOUND) { assert(num_devices != 0); @@ -602,8 +618,8 @@ pi_result piextDeviceSelectBinary(pi_device device, pi_device_binary *images, cl_device_type device_type; constexpr pi_uint32 invalid_ind = std::numeric_limits::max(); cl_int ret_err = - clGetDeviceInfo(device->cl_device, CL_DEVICE_TYPE, sizeof(cl_device_type), - &device_type, nullptr); + clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, + sizeof(cl_device_type), &device_type, nullptr); if (ret_err != CL_SUCCESS) { *selected_image_ind = invalid_ind; return cast(ret_err); @@ -669,6 +685,7 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, assert(Properties[2] == 0); if (Properties[2] != 0) return PI_ERROR_INVALID_VALUE; + queue2dev.insert({*Queue, Device}); return piQueueCreate(Context, Device, Flags, Queue); } @@ -678,7 +695,7 @@ pi_result piQueueCreate(pi_context context, pi_device device, cl_platform_id curPlatform; cl_int ret_err = - clGetDeviceInfo(device->cl_device, CL_DEVICE_PLATFORM, + clGetDeviceInfo(getClDevice(device), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &curPlatform, nullptr); CHECK_ERR_SET_NULL_RET(ret_err, queue, ret_err); @@ -702,32 +719,37 @@ pi_result piQueueCreate(pi_context context, pi_device device, if (version >= OCLV::V2_0) { *queue = cast(clCreateCommandQueue( - cast(context), device->cl_device, + cast(context), getClDevice(device), cast(properties) & SupportByOpenCL, &ret_err)); return cast(ret_err); } - if (device->level == _pi_device::SUBSUBDEVICE) { + if (isCCS(device)) { + auto family = cslice_devices[device].family; + auto index = cslice_devices[device].index; cl_queue_properties CreationFlagProperties[] = { CL_QUEUE_PROPERTIES, cast(properties) & SupportByOpenCL, CL_QUEUE_FAMILY_INTEL, - device->family, + family, CL_QUEUE_INDEX_INTEL, - device->index, + index, 0}; *queue = cast(clCreateCommandQueueWithProperties( - cast(context), device->cl_device, CreationFlagProperties, + cast(context), getClDevice(device), CreationFlagProperties, &ret_err)); + } else { cl_queue_properties CreationFlagProperties[] = { CL_QUEUE_PROPERTIES, cast(properties) & SupportByOpenCL, 0}; *queue = cast(clCreateCommandQueueWithProperties( - cast(context), device->cl_device, CreationFlagProperties, + cast(context), getClDevice(device), CreationFlagProperties, &ret_err)); } + if (ret_err == CL_SUCCESS) + queue2dev.insert({*queue, device}); return cast(ret_err); } @@ -742,6 +764,16 @@ pi_result piQueueGetInfo(pi_queue queue, pi_queue_info param_name, case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: // OpenCL doesn't provide API to check the status of the queue. return PI_ERROR_INVALID_VALUE; + case PI_QUEUE_INFO_DEVICE: { + if (queue2dev.find(queue) == queue2dev.end()) + return PI_ERROR_INVALID_VALUE; + pi_device dev = queue2dev[queue]; + assert(param_value); + std::memcpy(param_value, &dev, sizeof(dev)); + if (param_value_size_ret) + *param_value_size_ret = sizeof(pi_device); + return PI_SUCCESS; + } default: cl_int CLErr = clGetCommandQueueInfo( cast(queue), cast(param_name), @@ -995,7 +1027,7 @@ pi_result piextGetDeviceFunctionPointer(pi_device device, pi_program program, // If clGetDeviceFunctionPointer is in list of extensions if (FuncT) { pi_ret_err = - cast(FuncT(device->cl_device, cast(program), + cast(FuncT(getClDevice(device), cast(program), func_name, function_pointer_ret)); // GPU runtime sometimes returns PI_ERROR_INVALID_ARG_VALUE if func address // cannot be found even if kernel exits. As the kernel does exist return @@ -1015,13 +1047,12 @@ pi_result piContextCreate(const pi_context_properties *properties, size_t cb, void *user_data1), void *user_data, pi_context *retcontext) { pi_result ret = PI_ERROR_INVALID_OPERATION; - std::vector cl_devices(num_devices); - for (size_t i = 0; i < num_devices; ++i) - cl_devices[i] = devices[i]->cl_device; + std::vector cl_devices = getClDevices(num_devices, devices); *retcontext = cast( clCreateContext(properties, cast(num_devices), cl_devices.data(), pfn_notify, user_data, cast(&ret))); - + if (ret == PI_SUCCESS) + context2devlist.insert({*retcontext, std::make_pair(devices, num_devices)}); return ret; } @@ -1036,6 +1067,7 @@ pi_result piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, assert(piContext != nullptr); assert(ownNativeHandle == false); *piContext = reinterpret_cast(nativeHandle); + context2devlist.insert({*piContext, std::make_pair(devices, num_devices)}); return PI_SUCCESS; } @@ -1051,6 +1083,19 @@ pi_result piContextGetInfo(pi_context context, pi_context_info paramName, std::memcpy(paramValue, &result, sizeof(cl_bool)); return PI_SUCCESS; } + case PI_CONTEXT_INFO_DEVICES: { + if (context2devlist.find(context) != context2devlist.end()) { + auto devlist = context2devlist[context]; + const pi_device *devices = devlist.first; + size_t num_devices = devlist.second; + if (paramValueSizeRet) + *paramValueSizeRet = num_devices * sizeof(pi_device); + assert(paramValue); + std::memcpy(paramValue, devices, num_devices * sizeof(pi_device)); + return PI_SUCCESS; + } + return PI_ERROR_INVALID_VALUE; + } default: cl_int result = clGetContextInfo( cast(context), cast(paramName), @@ -1142,16 +1187,43 @@ pi_result piProgramCreateWithBinary( (void)num_metadata_entries; pi_result ret_err = PI_ERROR_INVALID_OPERATION; - std::vector cl_devices(num_devices); - for (size_t i = 0; i < num_devices; ++i) - cl_devices[i] = device_list[i]->cl_device; + std::vector cl_devices = getClDevices(num_devices, device_list); *ret_program = cast(clCreateProgramWithBinary( cast(context), cast(num_devices), cl_devices.data(), lengths, binaries, cast(binary_status), cast(&ret_err))); + if (ret_err == PI_SUCCESS) + program2devlist.insert( + {*ret_program, std::make_pair(device_list, num_devices)}); return ret_err; } +pi_result piProgramGetInfo(pi_program program, pi_program_info paramName, + size_t paramValueSize, void *paramValue, + size_t *paramValueSizeRet) { + assert(program != nullptr); + switch (paramName) { + case PI_PROGRAM_INFO_DEVICES: { + if (program2devlist.find(program) != program2devlist.end()) { + auto devlist = program2devlist[program]; + const pi_device *devices = devlist.first; + size_t num_devices = devlist.second; + if (paramValueSizeRet) + *paramValueSizeRet = num_devices * sizeof(pi_device); + assert(paramValue); + std::memcpy(paramValue, devices, num_devices * sizeof(pi_device)); + return PI_SUCCESS; + } + return PI_ERROR_INVALID_VALUE; + } + default: + cl_int result = clGetProgramInfo( + cast(program), cast(paramName), + paramValueSize, paramValue, paramValueSizeRet); + return static_cast(result); + } +} + pi_result piProgramLink(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_programs, @@ -1160,15 +1232,16 @@ pi_result piProgramLink(pi_context context, pi_uint32 num_devices, void *user_data, pi_program *ret_program) { pi_result ret_err = PI_ERROR_INVALID_OPERATION; - std::vector cl_devices(num_devices); - for (size_t i = 0; i < num_devices; ++i) - cl_devices[i] = device_list[i]->cl_device; + std::vector cl_devices = getClDevices(num_devices, device_list); *ret_program = cast(clLinkProgram( cast(context), cast(num_devices), cl_devices.data(), options, cast(num_input_programs), cast(input_programs), cast(pfn_notify), user_data, cast(&ret_err))); + if (ret_err == PI_SUCCESS) + program2devlist.insert( + {*ret_program, std::make_pair(device_list, num_devices)}); return ret_err; } @@ -1194,7 +1267,7 @@ pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, return PI_ERROR_INVALID_VALUE; default: cl_int result = clGetKernelWorkGroupInfo( - cast(kernel), device->cl_device, + cast(kernel), getClDevice(device), cast(param_name), param_value_size, param_value, param_value_size_ret); return static_cast(result); @@ -1236,7 +1309,7 @@ pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, } ret_err = cast(clGetKernelSubGroupInfo( - cast(kernel), device->cl_device, + cast(kernel), getClDevice(device), cast(param_name), input_value_size, input_value, sizeof(size_t), &ret_val, param_value_size_ret)); @@ -1357,7 +1430,7 @@ pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context, context, &FuncPtr); if (FuncPtr) { - Ptr = FuncPtr(cast(context), device->cl_device, + Ptr = FuncPtr(cast(context), getClDevice(device), cast(properties), size, alignment, cast(&RetVal)); } @@ -1395,7 +1468,7 @@ pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, context, &FuncPtr); if (FuncPtr) { - Ptr = FuncPtr(cast(context), device->cl_device, + Ptr = FuncPtr(cast(context), getClDevice(device), cast(properties), size, alignment, cast(&RetVal)); } @@ -1938,7 +2011,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piProgramCreate, piProgramCreate) _PI_CL(piclProgramCreateWithSource, piclProgramCreateWithSource) _PI_CL(piProgramCreateWithBinary, piProgramCreateWithBinary) - _PI_CL(piProgramGetInfo, clGetProgramInfo) + _PI_CL(piProgramGetInfo, piProgramGetInfo) _PI_CL(piProgramCompile, clCompileProgram) _PI_CL(piProgramBuild, clBuildProgram) _PI_CL(piProgramLink, piProgramLink) diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index b3ffe9a32c3d9..a2ceb7d4907bf 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -17,12 +17,10 @@ #ifndef PI_OPENCL_HPP #define PI_OPENCL_HPP -#include #include -#include +#include #include #include -#include #include #include #include @@ -120,31 +118,35 @@ inline const OpenCLVersion V3_0(3, 0); } // namespace OCLV -// Define the types that are opaque in pi.h in a manner suitable for OpenCL -// plugin - -struct _pi_device : _pi_object { - enum device_level { - ROOTDEVICE = 0, - SUBDEVICE = 1, - SUBSUBDEVICE = 2, - INVALID = -1 - }; - _pi_device(cl_device_id cl_dev) : cl_device{cl_dev} { - level = INVALID; - family = index = 0; - } - // PI platform to which this device belongs. - cl_device_id cl_device; +// Following of helper data structures to extend OpenCL plugin behavior. +// These data structures are persistent during run-time. +// TODO: Optimizations to clean-up resources during CL objects deletion +// A longer term solution will be to extend pi_* data structures to add new +// fields and get rid of these data structures. + +// This data structure is used to represent information about cslice subdevices. +struct csliceSubDevInfo { + cl_device_id cl_dev; // device to which the cslice belongs + size_t family; + size_t index; +}; - // Info stored for sub-sub device queue creation - device_level level; - pi_uint32 family; // SYCL queue family - pi_uint32 index; // SYCL queue index inside a given family of queues +// This data structure is used to store all cslice subdevices. +// For a regular pi_device, cl_device_id can be obtained by a simple typecast. +// For a cslice subdevice, we explicitly store the cl_device_id and then +// retrieve it when needed. +std::map cslice_devices; - bool isRootDevice(void) { return level == ROOTDEVICE; } - bool isSubDevice(void) { return level == SUBDEVICE; } - bool isCCS(void) { return level == SUBSUBDEVICE; } -}; +// This map is used to capture pi_device info during queue creation and retrieve +// it during getinfo calls. +std::map queue2dev; + +// This map is used to capture pi_device info during context creation and +// retrieve it during getinfo calls. +std::map> context2devlist; + +// This map is used to capture pi_device info during program creation and +// retrieve it during getinfo calls. +std::map> program2devlist; #endif // PI_OPENCL_HPP diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 1f48ffd9d629f..7ddf7158e653a 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -28,6 +28,7 @@ piPluginGetLastError piPluginInit piProgramCreate piProgramCreateWithBinary +piProgramGetInfo piProgramLink piQueueCreate piextQueueCreate From 8ae295629bf490be6f23373e92c1c622be457bc7 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 25 Jan 2023 14:04:41 -0800 Subject: [PATCH 13/23] Minor typos Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 3 +-- sycl/plugins/opencl/pi_opencl.hpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index f569797c9e250..a76ad5b899cf1 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -265,7 +265,7 @@ static bool isCCS(pi_device device) { return cslice_devices.find(device) != cslice_devices.end(); } -// Returns the underlysing Cl device. +// Returns the underlying CL device. // For a regular pi_device, cl_device_id can be obtained by a simple typecast. // For a cslice subdevice, we explicitly store the cl_device_id and then // retrieve it when needed. @@ -688,7 +688,6 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, queue2dev.insert({*Queue, Device}); return piQueueCreate(Context, Device, Flags, Queue); } - pi_result piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue) { assert(queue && "piQueueCreate failed, queue argument is null"); diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index a2ceb7d4907bf..81797b3536e61 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -118,7 +118,7 @@ inline const OpenCLVersion V3_0(3, 0); } // namespace OCLV -// Following of helper data structures to extend OpenCL plugin behavior. +// Following are helper data structures to extend OpenCL plugin behavior. // These data structures are persistent during run-time. // TODO: Optimizations to clean-up resources during CL objects deletion // A longer term solution will be to extend pi_* data structures to add new From 66468327a9bd9b578be3b13155a20ce7390db7a7 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 25 Jan 2023 20:37:59 -0800 Subject: [PATCH 14/23] Fix some of the test failures Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 98 +++++++++++++++++-------------- sycl/plugins/opencl/pi_opencl.hpp | 4 +- 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index a76ad5b899cf1..91ddfae55be4f 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -406,29 +406,14 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, nullptr); cl_bool is_gpu = (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); - // Partition property for non GPU backends. // For non-GPU backends, partition property are obtained by calling // clGetDeviceInfo. if (!is_gpu) { if (num_sub_devices < 2) return return_value(pi_device_partition_property{0}); - cl_device_partition_property props[2] = {0, 0}; - size_t props_ret_size = 0; - clGetDeviceInfo(getClDevice(device), CL_DEVICE_PARTITION_PROPERTIES, - sizeof(props), &props, &props_ret_size); - switch (props_ret_size) { - case 0: - return return_value(pi_device_partition_property{0}); - case 1: - ReturnHelper(props[0]); - [[fallthrough]]; - case 2: - ReturnHelper(props[0], props[1]); - [[fallthrough]]; - default: - return PI_ERROR_INVALID_VALUE; - } + return ReturnHelper(CL_DEVICE_PARTITION_EQUALLY, + CL_DEVICE_PARTITION_BY_COUNTS); } // Partition property for GPU @@ -764,14 +749,15 @@ pi_result piQueueGetInfo(pi_queue queue, pi_queue_info param_name, // OpenCL doesn't provide API to check the status of the queue. return PI_ERROR_INVALID_VALUE; case PI_QUEUE_INFO_DEVICE: { - if (queue2dev.find(queue) == queue2dev.end()) - return PI_ERROR_INVALID_VALUE; - pi_device dev = queue2dev[queue]; - assert(param_value); - std::memcpy(param_value, &dev, sizeof(dev)); - if (param_value_size_ret) - *param_value_size_ret = sizeof(pi_device); - return PI_SUCCESS; + if (queue2dev.find(queue) != queue2dev.end()) { + pi_device dev = queue2dev[queue]; + if (param_value) + std::memcpy(param_value, &dev, sizeof(dev)); + if (param_value_size_ret) + *param_value_size_ret = sizeof(pi_device); + return PI_SUCCESS; + } + [[fallthrough]]; } default: cl_int CLErr = clGetCommandQueueInfo( @@ -1050,8 +1036,12 @@ pi_result piContextCreate(const pi_context_properties *properties, *retcontext = cast( clCreateContext(properties, cast(num_devices), cl_devices.data(), pfn_notify, user_data, cast(&ret))); - if (ret == PI_SUCCESS) - context2devlist.insert({*retcontext, std::make_pair(devices, num_devices)}); + if (ret == PI_SUCCESS) { + std::vector device_list_vec(num_devices); + for (size_t i = 0; i < num_devices; ++i) + device_list_vec[i] = devices[i]; + context2devlist.insert({*retcontext, device_list_vec}); + } return ret; } @@ -1066,7 +1056,10 @@ pi_result piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, assert(piContext != nullptr); assert(ownNativeHandle == false); *piContext = reinterpret_cast(nativeHandle); - context2devlist.insert({*piContext, std::make_pair(devices, num_devices)}); + std::vector device_list_vec(num_devices); + for (size_t i = 0; i < num_devices; ++i) + device_list_vec[i] = devices[i]; + context2devlist.insert({*piContext, device_list_vec}); return PI_SUCCESS; } @@ -1085,15 +1078,15 @@ pi_result piContextGetInfo(pi_context context, pi_context_info paramName, case PI_CONTEXT_INFO_DEVICES: { if (context2devlist.find(context) != context2devlist.end()) { auto devlist = context2devlist[context]; - const pi_device *devices = devlist.first; - size_t num_devices = devlist.second; + size_t num_devices = devlist.size(); if (paramValueSizeRet) *paramValueSizeRet = num_devices * sizeof(pi_device); - assert(paramValue); - std::memcpy(paramValue, devices, num_devices * sizeof(pi_device)); + if (paramValue) + std::memcpy(paramValue, devlist.data(), + num_devices * sizeof(pi_device)); return PI_SUCCESS; } - return PI_ERROR_INVALID_VALUE; + [[fallthrough]]; } default: cl_int result = clGetContextInfo( @@ -1191,9 +1184,12 @@ pi_result piProgramCreateWithBinary( cast(context), cast(num_devices), cl_devices.data(), lengths, binaries, cast(binary_status), cast(&ret_err))); - if (ret_err == PI_SUCCESS) - program2devlist.insert( - {*ret_program, std::make_pair(device_list, num_devices)}); + if (ret_err == PI_SUCCESS) { + std::vector device_list_vec(num_devices); + for (size_t i = 0; i < num_devices; ++i) + device_list_vec[i] = device_list[i]; + program2devlist.insert({*ret_program, device_list_vec}); + } return ret_err; } @@ -1205,15 +1201,15 @@ pi_result piProgramGetInfo(pi_program program, pi_program_info paramName, case PI_PROGRAM_INFO_DEVICES: { if (program2devlist.find(program) != program2devlist.end()) { auto devlist = program2devlist[program]; - const pi_device *devices = devlist.first; - size_t num_devices = devlist.second; + size_t num_devices = devlist.size(); if (paramValueSizeRet) *paramValueSizeRet = num_devices * sizeof(pi_device); - assert(paramValue); - std::memcpy(paramValue, devices, num_devices * sizeof(pi_device)); + if (paramValue) + std::memcpy(paramValue, devlist.data(), + num_devices * sizeof(pi_device)); return PI_SUCCESS; } - return PI_ERROR_INVALID_VALUE; + [[fallthrough]]; } default: cl_int result = clGetProgramInfo( @@ -1238,12 +1234,26 @@ pi_result piProgramLink(pi_context context, pi_uint32 num_devices, cast(input_programs), cast(pfn_notify), user_data, cast(&ret_err))); - if (ret_err == PI_SUCCESS) - program2devlist.insert( - {*ret_program, std::make_pair(device_list, num_devices)}); + if (ret_err == PI_SUCCESS) { + std::vector device_list_vec(num_devices); + for (size_t i = 0; i < num_devices; ++i) + device_list_vec[i] = device_list[i]; + program2devlist.insert({*ret_program, device_list_vec}); + } return ret_err; } +pi_result piProgramGetBuildInfo(pi_program program, pi_device device, + pi_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + cl_int result = clGetProgramBuildInfo( + cast(program), getClDevice(device), + cast(param_name), param_value_size, param_value, + param_value_size_ret); + return static_cast(result); +} + pi_result piKernelCreate(pi_program program, const char *kernel_name, pi_kernel *ret_kernel) { diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index 81797b3536e61..f6351e4d2026c 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -143,10 +143,10 @@ std::map queue2dev; // This map is used to capture pi_device info during context creation and // retrieve it during getinfo calls. -std::map> context2devlist; +std::map> context2devlist; // This map is used to capture pi_device info during program creation and // retrieve it during getinfo calls. -std::map> program2devlist; +std::map> program2devlist; #endif // PI_OPENCL_HPP From c7a36e86c154751bc6c851310715d81e47575654 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 25 Jan 2023 23:22:31 -0800 Subject: [PATCH 15/23] Update api check_dump Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 2 +- sycl/test/abi/pi_opencl_symbol_check.dump | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 91ddfae55be4f..29f4dc6c8b24a 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -2024,7 +2024,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piProgramCompile, clCompileProgram) _PI_CL(piProgramBuild, clBuildProgram) _PI_CL(piProgramLink, piProgramLink) - _PI_CL(piProgramGetBuildInfo, clGetProgramBuildInfo) + _PI_CL(piProgramGetBuildInfo, piProgramGetBuildInfo) _PI_CL(piProgramRetain, clRetainProgram) _PI_CL(piProgramRelease, clReleaseProgram) _PI_CL(piextProgramSetSpecializationConstant, diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 7ddf7158e653a..389517d76e03a 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -28,6 +28,7 @@ piPluginGetLastError piPluginInit piProgramCreate piProgramCreateWithBinary +piProgramGetBuildInfo piProgramGetInfo piProgramLink piQueueCreate From 6792ce700cd5590623f86bb9c9a1e6920ac2b901 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 26 Jan 2023 08:28:23 -0800 Subject: [PATCH 16/23] fix more tests Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 126 +++++++++++++++++------------- 1 file changed, 70 insertions(+), 56 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 29f4dc6c8b24a..b70e5d5d3ad36 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -258,6 +258,7 @@ static pi_result USMSetIndirectAccess(pi_kernel kernel) { extern "C" { // Helper functions + // Returns true if the device is a cslice subdevice. static bool isCCS(pi_device device) { if (!device) @@ -297,6 +298,15 @@ static std::vector getClDevices(pi_uint32 num_devices, cl_devices[i] = getClDevice(devices[i]); return cl_devices; } + +// Return true if the device is a GPU device +static bool isGPU(pi_device device) { + // Identify device type. + cl_device_type device_type; + cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, + sizeof(cl_device_type), &device_type, nullptr); + return (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); +} // End of helper functions pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, @@ -401,75 +411,79 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return return_value(partition_properties); }; - // Identify device type. - cl_device_type device_type; - cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, - sizeof(cl_device_type), &device_type, nullptr); - cl_bool is_gpu = (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); // Partition property for non GPU backends. // For non-GPU backends, partition property are obtained by calling // clGetDeviceInfo. - if (!is_gpu) { - if (num_sub_devices < 2) - return return_value(pi_device_partition_property{0}); - return ReturnHelper(CL_DEVICE_PARTITION_EQUALLY, - CL_DEVICE_PARTITION_BY_COUNTS); - } - - // Partition property for GPU - if (isRootDevice(device)) { + if (!isGPU(device)) { if (num_sub_devices < 2) return return_value(pi_device_partition_property{0}); - return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } else if (!isCCS(device)) { // it is subdevice - // Find out number of CCSes. - bool supported = false; - cl_int ret_err = CL_SUCCESS; - ret_err = checkDeviceExtensions( - getClDevice(device), {"cl_intel_command_queue_families"}, supported); - if (ret_err != CL_SUCCESS) - return static_cast(ret_err); - if (!supported) - return return_value(pi_device_partition_property{0}); - cl_queue_family_properties_intel qfprops[3]; - size_t qsize = 0; - clGetDeviceInfo(getClDevice(device), - CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, sizeof(qfprops), - qfprops, &qsize); - qsize = qsize / sizeof(cl_queue_family_properties_intel); - for (size_t q = 0; q < qsize; q++) { - if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && - qfprops[q].count > num_sub_devices) { - num_sub_devices = qfprops[q].count; + cl_int result = + clGetDeviceInfo(getClDevice(device), cast(paramName), + paramValueSize, paramValue, paramValueSizeRet); + return static_cast(result); + } else { + // Partition property for GPU + if (isRootDevice(device)) { + if (num_sub_devices < 2) + return return_value(pi_device_partition_property{0}); + return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } else if (!isCCS(device)) { // it is subdevice + // Find out number of CCSes. + bool supported = false; + cl_int ret_err = CL_SUCCESS; + ret_err = checkDeviceExtensions( + getClDevice(device), {"cl_intel_command_queue_families"}, supported); + if (ret_err != CL_SUCCESS) + return static_cast(ret_err); + if (!supported) + return return_value(pi_device_partition_property{0}); + cl_queue_family_properties_intel qfprops[3]; + size_t qsize = 0; + clGetDeviceInfo(getClDevice(device), + CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, sizeof(qfprops), + qfprops, &qsize); + qsize = qsize / sizeof(cl_queue_family_properties_intel); + for (size_t q = 0; q < qsize; q++) { + if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && + qfprops[q].count > num_sub_devices) { + num_sub_devices = qfprops[q].count; + } } - } - if (num_sub_devices < 2) { + if (num_sub_devices < 2) { + return return_value(pi_device_partition_property{0}); + } + return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); + } else // it is CCS return return_value(pi_device_partition_property{0}); - } - return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE); - } else // it is CCS - return return_value(pi_device_partition_property{0}); + } } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: return return_value(pi_device_affinity_domain{ PI_DEVICE_AFFINITY_DOMAIN_NUMA | PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { - // For root-device there is no partitioning to report. - if (isRootDevice(device)) - return return_value(pi_device_partition_property{0}); - if (!isCCS(device)) { // is subdevice - struct { - pi_device_partition_property arr[3]; - } partition_properties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, - 0}}; - return return_value(partition_properties); - } else { // it is CCS - struct { - pi_device_partition_property arr[2]; - } partition_properties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; - return return_value(partition_properties); + if (!isGPU(device)) { + cl_int result = + clGetDeviceInfo(getClDevice(device), cast(paramName), + paramValueSize, paramValue, paramValueSizeRet); + return static_cast(result); + } else { + // For root-device there is no partitioning to report. + if (isRootDevice(device)) + return return_value(pi_device_partition_property{0}); + if (!isCCS(device)) { // is subdevice + struct { + pi_device_partition_property arr[3]; + } partition_properties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, + 0}}; + return return_value(partition_properties); + } else { // it is CCS + struct { + pi_device_partition_property arr[2]; + } partition_properties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; + return return_value(partition_properties); + } } return return_value(pi_device_partition_property{0}); } From 3f52fd76d7ae5eee97913ad1c08c3c7b94621a7d Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 26 Jan 2023 10:15:04 -0800 Subject: [PATCH 17/23] formatting issues Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 357b36f4491bd..87cdf8199c7cb 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -308,7 +308,7 @@ static bool isGPU(pi_device device) { // Identify device type. cl_device_type device_type; cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, - sizeof(cl_device_type), &device_type, nullptr); + sizeof(cl_device_type), &device_type, nullptr); return (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); } // End of helper functions @@ -435,8 +435,9 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, // Find out number of CCSes. bool supported = false; cl_int ret_err = CL_SUCCESS; - ret_err = checkDeviceExtensions( - getClDevice(device), {"cl_intel_command_queue_families"}, supported); + ret_err = checkDeviceExtensions(getClDevice(device), + {"cl_intel_command_queue_families"}, + supported); if (ret_err != CL_SUCCESS) return static_cast(ret_err); if (!supported) @@ -444,8 +445,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, cl_queue_family_properties_intel qfprops[3]; size_t qsize = 0; clGetDeviceInfo(getClDevice(device), - CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, sizeof(qfprops), - qfprops, &qsize); + CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, + sizeof(qfprops), qfprops, &qsize); qsize = qsize / sizeof(cl_queue_family_properties_intel); for (size_t q = 0; q < qsize; q++) { if (qfprops[q].capabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL && @@ -479,8 +480,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, struct { pi_device_partition_property arr[3]; } partition_properties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, - 0}}; + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, + 0}}; return return_value(partition_properties); } else { // it is CCS struct { From afe953e53d72017f0ddfc86fd28244daa338a70b Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 26 Jan 2023 11:49:34 -0800 Subject: [PATCH 18/23] Fix AMD test fail Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 87cdf8199c7cb..d0c4a26d2f881 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -303,14 +303,17 @@ static std::vector getClDevices(pi_uint32 num_devices, return cl_devices; } -// Return true if the device is a GPU device -static bool isGPU(pi_device device) { - // Identify device type. - cl_device_type device_type; - cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_TYPE, - sizeof(cl_device_type), &device_type, nullptr); - return (res == CL_SUCCESS) && (device_type == CL_DEVICE_TYPE_GPU); +// Return true if the device is a PVC device +static bool isPVC(pi_device device) { + // Identify device name. + const size_t MAXDEVICENAMELEN = 64; + std::string device_name(MAXDEVICENAMELEN, ' '); + cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_NAME, + MAXDEVICENAMELEN, &device_name[0], nullptr); + return (res == CL_SUCCESS) && + (device_name.find("0x0bd5") != std::string::npos); } + // End of helper functions pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, @@ -415,10 +418,10 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, return return_value(partition_properties); }; - // Partition property for non GPU backends. + // Partition property for non PVC backends. // For non-GPU backends, partition property are obtained by calling // clGetDeviceInfo. - if (!isGPU(device)) { + if (!isPVC(device)) { if (num_sub_devices < 2) return return_value(pi_device_partition_property{0}); cl_int result = @@ -467,7 +470,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, PI_DEVICE_AFFINITY_DOMAIN_NUMA | PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { - if (!isGPU(device)) { + if (!isPVC(device)) { cl_int result = clGetDeviceInfo(getClDevice(device), cast(paramName), paramValueSize, paramValue, paramValueSizeRet); From d6748dfa17ef2254d1e012bb0e5e6f5fc7358065 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Thu, 26 Jan 2023 14:13:36 -0800 Subject: [PATCH 19/23] Fix issues in program compile and build Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 30 +++++++++++++++++++++-- sycl/test/abi/pi_opencl_symbol_check.dump | 2 ++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index d0c4a26d2f881..a7b2a5b8f7bf1 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1241,6 +1241,32 @@ pi_result piProgramGetInfo(pi_program program, pi_program_info paramName, } } +pi_result piProgramCompile( + pi_program program, pi_uint32 num_devices, const pi_device *device_list, + const char *options, pi_uint32 num_input_headers, + const pi_program *input_headers, const char **header_include_names, + void (*pfn_notify)(pi_program program, void *user_data), void *user_data) { + std::vector cl_devices = getClDevices(num_devices, device_list); + cl_int result = clCompileProgram( + cast(program), cast(num_devices), cl_devices.data(), + options, cast(num_input_headers), + cast(input_headers), header_include_names, + cast(pfn_notify), user_data); + return static_cast(result); +} + +pi_result piProgramBuild(pi_program program, pi_uint32 num_devices, + const pi_device *device_list, const char *options, + void (*pfn_notify)(pi_program program, + void *user_data), + void *user_data) { + std::vector cl_devices = getClDevices(num_devices, device_list); + cl_int result = clBuildProgram( + cast(program), cast(num_devices), cl_devices.data(), + options, cast(pfn_notify), user_data); + return static_cast(result); +} + pi_result piProgramLink(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_programs, @@ -2166,8 +2192,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piclProgramCreateWithSource, piclProgramCreateWithSource) _PI_CL(piProgramCreateWithBinary, piProgramCreateWithBinary) _PI_CL(piProgramGetInfo, piProgramGetInfo) - _PI_CL(piProgramCompile, clCompileProgram) - _PI_CL(piProgramBuild, clBuildProgram) + _PI_CL(piProgramCompile, piProgramCompile) + _PI_CL(piProgramBuild, piProgramBuild) _PI_CL(piProgramLink, piProgramLink) _PI_CL(piProgramGetBuildInfo, piProgramGetBuildInfo) _PI_CL(piProgramRetain, clRetainProgram) diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index aefb3cda1131a..db8d5a2f1b09a 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -27,6 +27,8 @@ piMemImageCreate piPlatformsGet piPluginGetLastError piPluginInit +piProgramBuild +piProgramCompile piProgramCreate piProgramCreateWithBinary piProgramGetBuildInfo From 47bcf52b316c8f986993681694a7e4ba6b88c61f Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Tue, 31 Jan 2023 08:38:48 -0800 Subject: [PATCH 20/23] Address review comments Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 64 ++++++++++++++++++++++--------- sycl/plugins/opencl/pi_opencl.hpp | 31 --------------- 2 files changed, 46 insertions(+), 49 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index a7b2a5b8f7bf1..39e32024aba37 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -78,6 +78,37 @@ constexpr size_t MaxMessageSize = 256; thread_local pi_result ErrorMessageCode = PI_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; +// Following are helper data structures to extend OpenCL plugin behavior. +// These data structures are persistent during run-time. +// TODO: Optimizations to clean-up resources during CL objects deletion +// A longer term solution will be to extend pi_* data structures to add new +// fields and get rid of these data structures. + +// This data structure is used to represent information about cslice subdevices. +struct csliceSubDevInfo { + cl_device_id cl_dev; // device to which the cslice belongs + size_t family; + size_t index; +}; + +// This data structure is used to store all cslice subdevices. +// For a regular pi_device, cl_device_id can be obtained by a simple typecast. +// For a cslice subdevice, we explicitly store the cl_device_id and then +// retrieve it when needed. +static std::map cslice_devices; + +// This map is used to capture pi_device info during queue creation and retrieve +// it during getinfo calls. +static std::map queue2dev; + +// This map is used to capture pi_device info during context creation and +// retrieve it during getinfo calls. +static std::map> context2devlist; + +// This map is used to capture pi_device info during program creation and +// retrieve it during getinfo calls. +static std::map> program2devlist; + // Utility function for setting a message and warning [[maybe_unused]] static void setErrorMessage(const char *message, pi_result error_code) { @@ -303,15 +334,12 @@ static std::vector getClDevices(pi_uint32 num_devices, return cl_devices; } -// Return true if the device is a PVC device +// Return true if the device is a Data Center GPU Max series (PVC) device. static bool isPVC(pi_device device) { - // Identify device name. - const size_t MAXDEVICENAMELEN = 64; - std::string device_name(MAXDEVICENAMELEN, ' '); - cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_NAME, - MAXDEVICENAMELEN, &device_name[0], nullptr); - return (res == CL_SUCCESS) && - (device_name.find("0x0bd5") != std::string::npos); + cl_uint deviceId; + cl_int res = clGetDeviceInfo(getClDevice(device), CL_DEVICE_ID_INTEL, + sizeof(cl_uint), &deviceId, nullptr); + return (res == CL_SUCCESS) && ((deviceId & 0xff0) == 0xbd0); } // End of helper functions @@ -532,15 +560,15 @@ pi_result piDevicePartition(pi_device device, } } *out_num_devices = sub_device_count; - if (out_devices) { - for (uint32_t i = 0; i < *out_num_devices; ++i) { - out_devices[i] = cast(new cl_device_id()); - csliceSubDevInfo info; - info.cl_dev = cast(device); - info.family = family; - info.index = i % (*out_num_devices); - cslice_devices.insert({out_devices[i], info}); - } + if (!out_devices) + return PI_SUCCESS; + for (uint32_t i = 0; i < *out_num_devices; ++i) { + out_devices[i] = cast(new cl_device_id()); + csliceSubDevInfo info; + info.cl_dev = cast(device); + info.family = family; + info.index = i; + cslice_devices.insert({out_devices[i], info}); } return PI_SUCCESS; } @@ -723,7 +751,7 @@ pi_result piQueueCreate(pi_context context, pi_device device, CHECK_ERR_SET_NULL_RET(ret_err, queue, ret_err); - if (version >= OCLV::V2_0) { + if (version < OCLV::V2_0) { *queue = cast(clCreateCommandQueue( cast(context), getClDevice(device), cast(properties) & SupportByOpenCL, diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index f6351e4d2026c..68315128af080 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -118,35 +118,4 @@ inline const OpenCLVersion V3_0(3, 0); } // namespace OCLV -// Following are helper data structures to extend OpenCL plugin behavior. -// These data structures are persistent during run-time. -// TODO: Optimizations to clean-up resources during CL objects deletion -// A longer term solution will be to extend pi_* data structures to add new -// fields and get rid of these data structures. - -// This data structure is used to represent information about cslice subdevices. -struct csliceSubDevInfo { - cl_device_id cl_dev; // device to which the cslice belongs - size_t family; - size_t index; -}; - -// This data structure is used to store all cslice subdevices. -// For a regular pi_device, cl_device_id can be obtained by a simple typecast. -// For a cslice subdevice, we explicitly store the cl_device_id and then -// retrieve it when needed. -std::map cslice_devices; - -// This map is used to capture pi_device info during queue creation and retrieve -// it during getinfo calls. -std::map queue2dev; - -// This map is used to capture pi_device info during context creation and -// retrieve it during getinfo calls. -std::map> context2devlist; - -// This map is used to capture pi_device info during program creation and -// retrieve it during getinfo calls. -std::map> program2devlist; - #endif // PI_OPENCL_HPP From ad3d1dabeaa3cda9970886bebaa176a34bcd9d57 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 1 Feb 2023 06:46:31 -0800 Subject: [PATCH 21/23] Address more review comments Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 39e32024aba37..2e90b5a5f1398 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -317,6 +317,8 @@ static cl_device_id getClDevice(pi_device device) { static bool isRootDevice(pi_device device) { if (!device) return false; + if (isCCS(device)) + return false; cl_device_id parentId = nullptr; clGetDeviceInfo(getClDevice(device), CL_DEVICE_PARENT_DEVICE, sizeof(cl_device_id), &parentId, NULL); @@ -569,6 +571,8 @@ pi_result piDevicePartition(pi_device device, info.family = family; info.index = i; cslice_devices.insert({out_devices[i], info}); + if (result = clRetainDevice(info.cl_dev)) + return cast(result); } return PI_SUCCESS; } @@ -806,8 +810,8 @@ pi_result piQueueGetInfo(pi_queue queue, pi_queue_info param_name, if (param_value_size_ret) *param_value_size_ret = sizeof(pi_device); return PI_SUCCESS; - } - [[fallthrough]]; + } else + return PI_ERROR_INVALID_VALUE; } default: cl_int CLErr = clGetCommandQueueInfo( @@ -2112,8 +2116,20 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. +// We clear all the 'map' variables here. pi_result piTearDown(void *PluginParameter) { (void)PluginParameter; + for (auto &entry : cslice_devices) + if (entry.first) + delete entry.first; + cslice_devices.clear(); + queue2dev.clear(); + for (auto &entry : context2devlist) + entry.second.clear(); + context2devlist.clear(); + for (auto &entry : program2devlist) + entry.second.clear(); + program2devlist.clear(); return PI_SUCCESS; } From daf8f73008e53a000f09d805b2d3773aef709dc6 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 1 Feb 2023 07:18:55 -0800 Subject: [PATCH 22/23] Address Werror issues Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 2e90b5a5f1398..01a18cad736c4 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -571,8 +571,9 @@ pi_result piDevicePartition(pi_device device, info.family = family; info.index = i; cslice_devices.insert({out_devices[i], info}); - if (result = clRetainDevice(info.cl_dev)) - return cast(result); + auto res = clRetainDevice(info.cl_dev); + if (res) + return cast(res); } return PI_SUCCESS; } @@ -2119,9 +2120,6 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, // We clear all the 'map' variables here. pi_result piTearDown(void *PluginParameter) { (void)PluginParameter; - for (auto &entry : cslice_devices) - if (entry.first) - delete entry.first; cslice_devices.clear(); queue2dev.clear(); for (auto &entry : context2devlist) From 142fa3174f4a9abbc807d640bf97f4d291f0c573 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 1 Feb 2023 10:16:57 -0800 Subject: [PATCH 23/23] Address Werror issues Signed-off-by: Arvind Sudarsanam --- sycl/plugins/opencl/pi_opencl.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 01a18cad736c4..18020ac100eb3 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -2120,14 +2120,6 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, // We clear all the 'map' variables here. pi_result piTearDown(void *PluginParameter) { (void)PluginParameter; - cslice_devices.clear(); - queue2dev.clear(); - for (auto &entry : context2devlist) - entry.second.clear(); - context2devlist.clear(); - for (auto &entry : program2devlist) - entry.second.clear(); - program2devlist.clear(); return PI_SUCCESS; }