From 2bc3c1a2c1b1aa2807b06244146a0cf50e083781 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Thu, 9 Sep 2021 20:12:29 -0700 Subject: [PATCH 1/6] [SYCL][L0] Make all L0 events have device-visibility by default, and only create/signal/query their host-visible proxy event on demand. Signed-off-by: Sergey V Maslov --- sycl/plugins/level_zero/pi_level_zero.cpp | 189 ++++++++++++++++++---- sycl/plugins/level_zero/pi_level_zero.hpp | 33 +++- 2 files changed, 185 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 338449afcda5..58697a9a5d3c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -181,6 +181,13 @@ static void zePrint(const char *Format, ...) { } } +// Controls whether device-scope events are used. +static const bool ZeAllHostVisibleEvents = [] { + const auto DeviceEventsStr = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_EVENTS"); + bool result = (DeviceEventsStr ? (std::atoi(DeviceEventsStr) == 0) : false); + return result; +}(); + // Helper function to implement zeHostSynchronize. // The behavior is to avoid infinite wait during host sync under ZE_DEBUG. // This allows for a much more responsive debugging of hangs. @@ -379,8 +386,8 @@ pi_result _pi_mem::removeMapping(void *MappedTo, Mapping &MapInfo) { } pi_result -_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, - size_t &Index) { +_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, + size_t &Index, bool HostVisible) { // Maximum number of events that can be present in an event ZePool is captured // here. Setting it to 256 gave best possible performance for several // benchmarks. @@ -396,10 +403,23 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, return PI_INVALID_VALUE; } + // Setup for host-visible pool as needed. 
+ ze_event_pool_flag_t ZePoolFlag = {}; + ze_event_pool_handle_t *ZePool = [&] { + if (ZeAllHostVisibleEvents) { + ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + return &ZeEventPool; + } else if (HostVisible) { + ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + return &ZeHostVisibleEventPool; + } else { + return &ZeEventPool; + } + }(); + Index = 0; // Create one event ZePool per MaxNumEventsPerPool events - if ((ZeEventPool == nullptr) || - (NumEventsAvailableInEventPool[ZeEventPool] == 0)) { + if ((*ZePool == nullptr) || (NumEventsAvailableInEventPool[*ZePool] == 0)) { // Creation of the new ZePool with record in NumEventsAvailableInEventPool // and initialization of the record in NumEventsUnreleasedInEventPool must // be done atomically. Otherwise it is possible that @@ -414,34 +434,28 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, ZeStruct ZeEventPoolDesc; ZeEventPoolDesc.count = MaxNumEventsPerPool; - - // Make all events visible on the host. - // TODO: events that are used only on device side APIs can be optimized - // to not be from the host-visible pool. 
- // - ZeEventPoolDesc.flags = - ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + ZeEventPoolDesc.flags = ZePoolFlag | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; std::vector ZeDevices; std::for_each(Devices.begin(), Devices.end(), [&](pi_device &D) { ZeDevices.push_back(D->ZeDevice); }); ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(), - &ZeDevices[0], &ZeEventPool)); - NumEventsAvailableInEventPool[ZeEventPool] = MaxNumEventsPerPool - 1; - NumEventsUnreleasedInEventPool[ZeEventPool] = MaxNumEventsPerPool; + &ZeDevices[0], ZePool)); + NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1; + NumEventsUnreleasedInEventPool[*ZePool] = MaxNumEventsPerPool; } else { std::lock_guard NumEventsAvailableInEventPoolGuard( NumEventsAvailableInEventPoolMutex); - Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[ZeEventPool]; - --NumEventsAvailableInEventPool[ZeEventPool]; + Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool]; + --NumEventsAvailableInEventPool[*ZePool]; } - ZePool = ZeEventPool; + Pool = *ZePool; return PI_SUCCESS; } -pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) { - ze_event_pool_handle_t ZePool = Event->ZeEventPool; +pi_result +_pi_context::decrementUnreleasedEventsInPool(ze_event_pool_handle_t &ZePool) { if (!ZePool) { // This must be an interop event created on a users's pool. // Do nothing. @@ -460,9 +474,9 @@ pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) { // multiple pi_context::ZeEventPool can be created if all slots in the pool // are already used up. So nullifying pi_context::ZeEventPool may point // a different EventPool than Event->ZeEventPool. 
- if (ZeEventPool == Event->ZeEventPool) + if (ZeEventPool == ZePool) ZeEventPool = nullptr; - Event->ZeEventPool = nullptr; + ZePool = nullptr; } return PI_SUCCESS; } @@ -761,6 +775,8 @@ pi_result _pi_context::finalize() { NumEventsUnreleasedInEventPoolMutex); if (ZeEventPool) ZE_CALL(zeEventPoolDestroy, (ZeEventPool)); + if (ZeHostVisibleEventPool) + ZE_CALL(zeEventPoolDestroy, (ZeHostVisibleEventPool)); // Destroy the command list used for initializations ZE_CALL(zeCommandListDestroy, (ZeCommandListInit)); @@ -1050,7 +1066,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // therefore that this Queue is idle. bool CurrentlyEmpty = this->LastCommandEvent == nullptr; - this->LastCommandEvent = CommandList->second.EventList.back(); + // The list can be empty if command-list only contains signals of proxy + // events. + if (!CommandList->second.EventList.empty()) + this->LastCommandEvent = CommandList->second.EventList.back(); // Batch if allowed to, but don't batch if we know there are no kernels // from this queue that are currently executing. This is intended to get @@ -1244,7 +1263,9 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( PI_ASSERT(EventList[I] != nullptr, PI_INVALID_VALUE); auto ZeEvent = EventList[I]->ZeEvent; - if (FilterEventWaitList) { + // Avoid polling of the device-scope events. + // TODO: be more fine-grain and check individual events. + if (FilterEventWaitList && ZeAllHostVisibleEvents) { auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, (ZeEvent)); if (Res == ZE_RESULT_SUCCESS) { // Event has already completed, don't put it into the list @@ -1539,6 +1560,8 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, if (NumPlatforms) *NumPlatforms = PiPlatformsCache->size(); + zePrint("Using %s events\n", + ZeAllHostVisibleEvents ? 
"all host-visible" : "device-only"); return PI_SUCCESS; } @@ -4371,6 +4394,68 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel, // // Events // +ze_event_handle_t _pi_event::getHostVisibleEvent() const { + if (ZeAllHostVisibleEvents) { + return ZeEvent; + } else if (ZeHostVisibleEvent) { + return ZeHostVisibleEvent; + } else { + die("The host-visible proxy event missing"); + } +} + +pi_result +_pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent) { + + if (ZeAllHostVisibleEvents) { + HostVisibleEvent = ZeEvent; + } else if (ZeHostVisibleEvent) { + HostVisibleEvent = ZeHostVisibleEvent; + } else { + size_t Index; + ze_event_pool_handle_t ZeEventPool = {}; + if (auto Res = + Context->getFreeSlotInExistingOrNewPool(ZeEventPool, Index, true)) + return Res; + + // Create a "proxy" host-visible event. + ZeStruct ZeEventDesc; + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + ZeEventDesc.wait = 0; + ZeEventDesc.index = Index; + + ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeHostVisibleEvent)); + ZeHostVisibleEventPool = ZeEventPool; + HostVisibleEvent = ZeHostVisibleEvent; + + // Submit the command(s) signalling the proxy event to the queue. + // We have to first submit a wait for the device-only event for which this + // proxy is created. 
+ // + // Get a new command list to be used on this call + { + std::lock_guard Lock(Queue->PiQueueMutex); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + pi_command_list_ptr_t CommandList{}; + if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, + false, OkToBatch)) + return Res; + + ZE_CALL(zeCommandListAppendWaitOnEvents, + (CommandList->first, 1, &ZeEvent)); + ZE_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, ZeHostVisibleEvent)); + + if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch)) + return Res; + } + } + return PI_SUCCESS; +} + pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { size_t Index = 0; ze_event_pool_handle_t ZeEventPool = {}; @@ -4379,12 +4464,21 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { ze_event_handle_t ZeEvent; ZeStruct ZeEventDesc; - // We have to set the SIGNAL flag as HOST scope because the - // Level-Zero plugin implementation waits for the events to complete - // on the host. - ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - ZeEventDesc.wait = 0; ZeEventDesc.index = Index; + ZeEventDesc.wait = 0; + // + // Set the scope to "device" for every event. This is sufficient for global + // device access and peer device access. If needed to be waited on the host + // we are doing special handling, see piEventsWait. + // + // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be + // used in some circumstances. 
+ // + if (ZeAllHostVisibleEvents) { + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + } else { + ZeEventDesc.signal = 0; + } ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent)); @@ -4434,9 +4528,19 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, return Res; } } +#if 0 + if (!ZeAllHostVisibleEvents) + return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, + pi_int32{CL_RUNNING}); +#endif + + // Make sure that we query the host-visible event. + ze_event_handle_t ZeHostVisibleEvent; + if (auto Res = Event->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) + return Res; ze_result_t ZeResult; - ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus, (Event->ZeEvent)); + ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus, (ZeHostVisibleEvent)); if (ZeResult == ZE_RESULT_SUCCESS) { return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, pi_int32{CL_COMPLETE}); // Untie from OpenCL @@ -4644,6 +4748,20 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { return PI_INVALID_EVENT; } + // Make sure to add all host-visible "proxy" event signals if needed. + // This ensures that all signalling commands are submitted below and + // thus proxy events can be waited without a deadlock. + // + // TODO: consider creating just single host-visible proxy event to + // wait for completion of multiple device-scope events requested. 
+ // + for (uint32_t I = 0; I < NumEvents; I++) { + ze_event_handle_t ZeHostVisibleEvent; + if (auto Res = + EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) + return Res; + } + // Submit dependent open command lists for execution, if any for (uint32_t I = 0; I < NumEvents; I++) { auto Queue = EventList[I]->Queue; @@ -4659,7 +4777,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { } for (uint32_t I = 0; I < NumEvents; I++) { - ze_event_handle_t ZeEvent = EventList[I]->ZeEvent; + ze_event_handle_t ZeEvent = EventList[I]->getHostVisibleEvent(); zePrint("ZeEvent = %#lx\n", pi_cast(ZeEvent)); ZE_CALL(zeHostSynchronize, (ZeEvent)); @@ -4725,11 +4843,20 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) { if (Event->OwnZeEvent) { ZE_CALL(zeEventDestroy, (Event->ZeEvent)); } + if (Event->ZeHostVisibleEvent) { + ZE_CALL(zeEventDestroy, (Event->ZeHostVisibleEvent)); + } auto Context = Event->Context; - if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) + if (auto Res = Context->decrementUnreleasedEventsInPool(Event->ZeEventPool)) return Res; + if (Event->ZeHostVisibleEvent) { + if (auto Res = Context->decrementUnreleasedEventsInPool( + Event->ZeHostVisibleEventPool)) + return Res; + } + // We intentionally incremented the reference counter when an event is // created so that we can avoid pi_queue is released before the associated // pi_event is released. Here we have to decrement it so pi_queue diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 748fb331d902..0e65b46bf9d8 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -536,12 +536,14 @@ struct _pi_context : _pi_object { bool AllowBatching = false); // Get index of the free slot in the available pool. If there is no available - // pool then create new one. 
- pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &); + // pool then create new one. The HostVisible parameter tells if we need a + // slot for a host-visible event. + pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, + bool HostVisible = false); // If event is destroyed then decrement number of events living in the pool // and destroy the pool if there are no unreleased events. - pi_result decrementUnreleasedEventsInPool(pi_event Event); + pi_result decrementUnreleasedEventsInPool(ze_event_pool_handle_t &ZePool); // Store USM allocator context(internal allocator structures) // for USM shared and device allocations. There is 1 allocator context @@ -561,11 +563,17 @@ struct _pi_context : _pi_object { private: // Following member variables are used to manage assignment of events // to event pools. - // TODO: These variables may be moved to pi_device and pi_platform - // if appropriate. + // + // TODO: Create pi_event_pool class to encapsulate working with pools. + // This will avoid needing the use of maps below, and cleanup the + // pi_context overall. + // // Event pool to which events are being added to. - ze_event_pool_handle_t ZeEventPool; + ze_event_pool_handle_t ZeEventPool = {nullptr}; + // Event pool to which host-visible events are added to. + ze_event_pool_handle_t ZeHostVisibleEventPool = {nullptr}; + // This map will be used to determine if a pool is full or not // by storing number of empty slots available in the pool. std::unordered_map @@ -902,6 +910,19 @@ struct _pi_event : _pi_object { // Level Zero event pool handle. ze_event_pool_handle_t ZeEventPool; + // In case we use device-only events/pools these are their host-visible + // counterparts. The idea is that two Level-Zero events co-exist: + // - one is always created with device-scope and used for GPU book-keeping. 
+ // - the other host-visible proxy event is created on demand when we need + // to query/wait on a device-scope event from the host. + // + ze_event_handle_t ZeHostVisibleEvent = {nullptr}; + ze_event_pool_handle_t ZeHostVisibleEventPool = {nullptr}; + // Get the host-visible event or create one and enqueue its signal. + pi_result getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); + // Get the host-visible event ensuring that one was already created before. + ze_event_handle_t getHostVisibleEvent() const; + // Level Zero command list where the command signaling this event was appended // to. This is currently used to remember/destroy the command list after all // commands in it are completed, i.e. this event signaled. From 586d7f388016c3e3aa0e9ea8ace6728b6c517534 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Fri, 10 Sep 2021 10:12:45 -0700 Subject: [PATCH 2/6] disable by default Signed-off-by: Sergey V Maslov --- sycl/doc/EnvironmentVariables.md | 7 +++++-- sycl/plugins/level_zero/pi_level_zero.cpp | 21 ++++++++++----------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 43af7fc852f0..f72e90ef6e7a 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -1,4 +1,4 @@ -# Environment Variables +#Environment Variables This document describes environment variables that are having effect on DPC++ compiler and runtime. @@ -23,7 +23,9 @@ subject to change. Do not rely on these variables in production code. | `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. | | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. | | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. 
| -| `SYCL_DEVICE_ALLOWLIST` | A list of devices and their driver version following the pattern: `BackendName:XXX,DeviceType:YYY,DeviceVendorId:0xXYZW,DriverVersion:{{X.Y.Z.W}}`. Also may contain `PlatformVersion`, `DeviceName` and `PlatformName`. There is no fixed order of properties in the pattern. | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu` or `acc`. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| +| `SYCL_DEVICE_ALLOWLIST` | A list of devices and their driver version following the pattern: `BackendName:XXX,DeviceType:YYY,DeviceVendorId:0xXYZW,DriverVersion:{ + { X.Y.Z.W } +}`. Also may contain `PlatformVersion`, `DeviceName` and `PlatformName`. There is no fixed order of properties in the pattern. | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu` or `acc`. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| | `SYCL_QUEUE_THREAD_POOL_SIZE` | Positive integer | Number of threads in thread pool of queue. 
| | `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images | | `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | @@ -34,6 +36,7 @@ subject to change. Do not rely on these variables in production code. | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy operations. | | `SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY` | Any(\*) | Enable support of the kernels with indirect access and corresponding deferred release of memory allocations in the Level Zero plugin. | +| `SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS` | Integer | Enable support of device-scope events whose state is not visible to the host. If enabled, the Level Zero plugin creates all events with device scope only and creates proxy host-visible events for them when their status is needed (wait/query) on the host. 
The default is 0, meaning all events are host-visible. | | `SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE` | Any(\*) | Enables tracing of `parallel_for` invocations with rounded-up ranges. | | `SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING` | Any(\*) | Disables automatic rounding-up of `parallel_for` invocation ranges. | | `SYCL_ENABLE_PCI` | Integer | When set to 1, enables obtaining the GPU PCI address when using the Level Zero backend. The default is 0. | diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 58697a9a5d3c..b0e4231f8298 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -183,8 +183,9 @@ static void zePrint(const char *Format, ...) { // Controls whether device-scope events are used. static const bool ZeAllHostVisibleEvents = [] { - const auto DeviceEventsStr = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_EVENTS"); - bool result = (DeviceEventsStr ? (std::atoi(DeviceEventsStr) == 0) : false); + const auto DeviceEventsStr = + std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); + bool result = (DeviceEventsStr ? (std::atoi(DeviceEventsStr) == 0) : true); return result; }(); @@ -4419,6 +4420,12 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent) { return Res; // Create a "proxy" host-visible event. + // + // TODO: consider creating just single host-visible proxy event to + // represent multiple device-scope events. E.g. have a host-visible + // event at the end of each command-list to represent device-scope + // events from every command in that command-list. 
+ // ZeStruct ZeEventDesc; ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; ZeEventDesc.wait = 0; @@ -4528,11 +4535,6 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, return Res; } } -#if 0 - if (!ZeAllHostVisibleEvents) - return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, - pi_int32{CL_RUNNING}); -#endif // Make sure that we query the host-visible event. ze_event_handle_t ZeHostVisibleEvent; @@ -4545,7 +4547,7 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, pi_int32{CL_COMPLETE}); // Untie from OpenCL } - // TODO: We don't know if the status is queueed, submitted or running. + // TODO: We don't know if the status is queued, submitted or running. // For now return "running", as others are unlikely to be of // interest. return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, @@ -4752,9 +4754,6 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { // This ensures that all signalling commands are submitted below and // thus proxy events can be waited without a deadlock. // - // TODO: consider creating just single host-visible proxy event to - // wait for completion of multiple device-scope events requested. - // for (uint32_t I = 0; I < NumEvents; I++) { ze_event_handle_t ZeHostVisibleEvent; if (auto Res = From 681b37fc9e733ddb462fd64bb8097082784a7532 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Fri, 10 Sep 2021 10:15:05 -0700 Subject: [PATCH 3/6] undo inappropriate clang-format Signed-off-by: Sergey V Maslov --- sycl/doc/EnvironmentVariables.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index f72e90ef6e7a..a44ea6edaf69 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -23,9 +23,7 @@ subject to change. Do not rely on these variables in production code. 
| `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. | | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. | | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. | -| `SYCL_DEVICE_ALLOWLIST` | A list of devices and their driver version following the pattern: `BackendName:XXX,DeviceType:YYY,DeviceVendorId:0xXYZW,DriverVersion:{ - { X.Y.Z.W } -}`. Also may contain `PlatformVersion`, `DeviceName` and `PlatformName`. There is no fixed order of properties in the pattern. | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu` or `acc`. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| +| `SYCL_DEVICE_ALLOWLIST` | A list of devices and their driver version following the pattern: `BackendName:XXX,DeviceType:YYY,DeviceVendorId:0xXYZW,DriverVersion:{{ X.Y.Z.W }}`. Also may contain `PlatformVersion`, `DeviceName` and `PlatformName`. There is no fixed order of properties in the pattern. | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu` or `acc`. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. 
DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| | `SYCL_QUEUE_THREAD_POOL_SIZE` | Positive integer | Number of threads in thread pool of queue. | | `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images | | `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | From 3792d2ef9f95f1b83385be828a307d70081c6582 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Fri, 10 Sep 2021 10:15:50 -0700 Subject: [PATCH 4/6] undo inappropriate clang-format Signed-off-by: Sergey V Maslov --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index a44ea6edaf69..55c9a9b1a665 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -23,7 +23,7 @@ subject to change. Do not rely on these variables in production code. | `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. | | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. | | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. | -| `SYCL_DEVICE_ALLOWLIST` | A list of devices and their driver version following the pattern: `BackendName:XXX,DeviceType:YYY,DeviceVendorId:0xXYZW,DriverVersion:{{ X.Y.Z.W }}`. Also may contain `PlatformVersion`, `DeviceName` and `PlatformName`. There is no fixed order of properties in the pattern. 
| Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu` or `acc`. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| +| `SYCL_DEVICE_ALLOWLIST` | A list of devices and their driver version following the pattern: `BackendName:XXX,DeviceType:YYY,DeviceVendorId:0xXYZW,DriverVersion:{{X.Y.Z.W}}`. Also may contain `PlatformVersion`, `DeviceName` and `PlatformName`. There is no fixed order of properties in the pattern. | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu` or `acc`. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| | `SYCL_QUEUE_THREAD_POOL_SIZE` | Positive integer | Number of threads in thread pool of queue. | | `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images | | `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. 
| From 5ddc7a9e6a84ccc48802d944429efbd5b8a1efdd Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Fri, 10 Sep 2021 10:16:22 -0700 Subject: [PATCH 5/6] undo inappropriate clang-format Signed-off-by: Sergey V Maslov --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 55c9a9b1a665..7d1fa27a6e90 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -1,4 +1,4 @@ -#Environment Variables +# Environment Variables This document describes environment variables that are having effect on DPC++ compiler and runtime. From 170da1878bc8a76f9e62488d6a3926620d170ae7 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Fri, 10 Sep 2021 15:08:10 -0700 Subject: [PATCH 6/6] use ZE_EVENT_SCOPE_FLAG_DEVICE signal scope Signed-off-by: Sergey V Maslov --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index b0e4231f8298..b7b70a2ded16 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4484,7 +4484,7 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { if (ZeAllHostVisibleEvents) { ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; } else { - ZeEventDesc.signal = 0; + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE; } ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));