intel · AlexeySachkov · Jun 18, 2025 · Mar 20, 2025 · Mar 26, 2025 · Mar 31, 2025
@@ -19,6 +19,7 @@ The Feature Test Macro SYCL\_EXT\_INTEL\_DEVICE\_INFO will be defined as one of
 | 4     | Free device memory query is supported |
 | 5     | Device ID is supported |
 | 6     | Memory clock rate and bus width queries are supported |
+| 7     | Throttle reasons, fan speed and power limits queries are supported |
 
 
 
@@ -489,6 +490,144 @@ Then the memory bus width can be obtained using the standard get\_info() interfa
       auto MemoryBusWidth = dev.get_info<ext::intel::info::device::memory_bus_width>();
     }
 
+# Throttle reason #
+
+A new device descriptor is added which provides the current clock throttle reasons.
+A new enum is added with the list of possible throttle reasons.
+
+## Version ##
+
+The extension supports this query in version 7 and later.
+
+## Throttle reasons ##
+
+| Reason             | Description |
+| ------------------ | ----------- |
+| `power_cap` | The clock frequency is throttled due to hitting the power limit. |
+| `current_limit` | The clock frequency is throttled due to hitting the current limit. |
+| `thermal_limit` | The clock frequency is throttled due to hitting the thermal limit. |
+| `psu_alert` | The clock frequency is throttled due to power supply assertion. |
+| `sw_range` | The clock frequency is throttled due to software supplied frequency range. |
+| `hw_range` | The clock frequency is throttled because there is a sub block that has a lower frequency when it receives clocks. |
+| `other` | The clock frequency is throttled due to other reason. |
+
+
+```
+namespace sycl::ext::intel {
+
+  enum class throttle_reason {
+    power_cap,
+    current_limit,
+    thermal_limit,
+    psu_alert,
+    sw_range,
+    hw_range,
+    other
+  };
+
+}
+```
+
+## Device Information Descriptors ##
+
+| Device Descriptors | Return Type | Description |
+| ------------------ | ----------- | ----------- |
+| `ext::intel::info::device::current_clock_throttle_reasons` | `std::vector<ext::intel::throttle_reason>` | Returns the set of throttle reasons describing why the frequency is being limited by the hardware. Returns empty set if frequency is not throttled. |
+
+
+## Aspects ##
+
+A new aspect, `ext_intel_current_clock_throttle_reasons`, is added.
+
+
+## Error Condition ##
+
+Throws a synchronous `exception` with the `errc::feature_not_supported` error code if the device does not have `aspect::ext_intel_current_clock_throttle_reasons`.
+
+## Example Usage ##
+
+Then the current clock throttle reasons can be obtained using the standard `get_info()` interface.
+
+```
+if (dev.has(aspect::ext_intel_current_clock_throttle_reasons)) {
+  std::vector<ext::intel::throttle_reason> Reasons = dev.get_info<ext::intel::info::device::current_clock_throttle_reasons>();
+}
+```
+
+
+# Fan speed #
+
+A new device descriptor is added which provides the fan speed for the device.
+
+## Version ##
+
+The extension supports this query in version 7 and later.
+
+## Device Information Descriptors ##
+
+| Device Descriptors | Return Type | Description |
+| ------------------ | ----------- | ----------- |
+| `ext::intel::info::device::fan_speed` | `int32_t` | Returns the current speed of the device's fan (as a percentage of the maximum speed of the fan). Returns -1 if the fan speed cannot be measured. If there are multiple fans, returns the maximum value among them. |
+
+
+## Aspects ##
+
+A new aspect, `ext_intel_fan_speed`, is added.
+
+
+## Error Condition ##
+
+Throws a synchronous `exception` with the `errc::feature_not_supported` error code if the device does not have `aspect::ext_intel_fan_speed`.
+
+## Example Usage ##
+
+Then the fan speed can be obtained using the standard `get_info()` interface.
+
+```
+    if (dev.has(aspect::ext_intel_fan_speed)) {
+      auto FanSpeed = dev.get_info<ext::intel::info::device::fan_speed>();
+    }
+```
+
+# Power limits #
+
+New device descriptors are added which provide the maximum and minimum power limits for the device.
+
+## Version ##
+
+The extension supports this query in version 7 and later.
+
+## Device Information Descriptors ##
+
+| Device Descriptors | Return Type | Description |
+| ------------------ | ----------- | ----------- |
+|`ext::intel::info::device::min_power_limit` |`int32_t` | Returns the minimum power limit of the device in milliwatts. Returns -1 if the limit is not known. |
+|`ext::intel::info::device::max_power_limit` |`int32_t` | Returns the maximum power limit of the device in milliwatts. Returns -1 if the limit is not known. |
+
+
+## Aspects ##
+
+A new aspect, `ext_intel_power_limits`, is added.
+
+
+## Error Condition ##
+
+Throws a synchronous `exception` with the `errc::feature_not_supported` error code if the device does not have `aspect::ext_intel_power_limits`.
+
+## Example Usage ##
+
+Then the power limits can be obtained using the standard `get_info()` interface.
+
+```
+    if (dev.has(aspect::ext_intel_power_limits)) {
+      auto Min = dev.get_info<ext::intel::info::device::min_power_limit>();
+      auto Max = dev.get_info<ext::intel::info::device::max_power_limit>();
+    }
+```
+
+
+
+
 # Deprecated queries #
 
 The table below lists deprecated, that would soon be removed and their replacements:

@@ -119,6 +119,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE
     ${PROJECT_NAME}::umf
     Threads::Threads
     cudadrv
+    CUDA::nvml
 )
 
 target_include_directories(${TARGET_NAME} PRIVATE

@@ -12,6 +12,7 @@
 #include "logger/ur_logger.hpp"
 
 #include <cuda.h>
+#include <nvml.h>
 
 #include <sstream>
 
@@ -36,6 +37,23 @@ ur_result_t mapErrorUR(CUresult Result) {
   }
 }
 
+/// Translates an NVML status code into a Unified Runtime result code.
+///
+/// \param Result Status returned by an NVML call.
+/// \return The matching ur_result_t; any NVML code without a dedicated
+///         mapping is reported as UR_RESULT_ERROR_UNKNOWN.
+ur_result_t mapErrorUR(nvmlReturn_t Result) {
+  if (Result == NVML_SUCCESS) {
+    return UR_RESULT_SUCCESS;
+  }
+  if (Result == NVML_ERROR_NOT_SUPPORTED) {
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  }
+  if (Result == NVML_ERROR_GPU_IS_LOST) {
+    return UR_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (Result == NVML_ERROR_MEMORY) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  }
+  if (Result == NVML_ERROR_INSUFFICIENT_RESOURCES) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+  return UR_RESULT_ERROR_UNKNOWN;
+}
+
 void checkErrorUR(CUresult Result, const char *Function, int Line,
                   const char *File) {
   if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) {
@@ -63,6 +81,30 @@ void checkErrorUR(CUresult Result, const char *Function, int Line,
   throw mapErrorUR(Result);
 }
 
+/// Validates the status returned by an NVML call.
+///
+/// NVML_SUCCESS returns immediately. Any other status is logged together with
+/// its NVML description and the call site, after which the status is thrown as
+/// the ur_result_t produced by mapErrorUR. If PI_CUDA_ABORT or UR_CUDA_ABORT is
+/// set in the environment, the process aborts instead of throwing.
+///
+/// \param Result   Status returned by the NVML call.
+/// \param Function Name of the calling function, used in the log message.
+/// \param Line     Source line of the call, used in the log message.
+/// \param File     Source file of the call, used in the log message.
+void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
+                  const char *File) {
+  if (Result != NVML_SUCCESS) {
+    std::stringstream Message;
+    Message << "\nUR NVML ERROR:"
+            << "\n\tValue:           " << Result
+            << "\n\tDescription:     " << nvmlErrorString(Result)
+            << "\n\tFunction:        " << Function
+            << "\n\tSource Location: " << File << ":" << Line << "\n";
+    logger::error("{}", Message.str());
+
+    const bool AbortRequested = std::getenv("PI_CUDA_ABORT") != nullptr ||
+                                std::getenv("UR_CUDA_ABORT") != nullptr;
+    if (AbortRequested) {
+      std::abort();
+    }
+
+    throw mapErrorUR(Result);
+  }
+}
+
 void checkErrorUR(ur_result_t Result, const char *Function, int Line,
                   const char *File) {
   if (Result == UR_RESULT_SUCCESS) {

@@ -10,6 +10,7 @@
 #pragma once
 
 #include <cuda.h>
+#include <nvml.h>
 #include <ur/ur.hpp>
 
 #include <umf/base.h>
@@ -35,6 +36,9 @@ ur_result_t mapErrorUR(CUresult Result);
 void checkErrorUR(CUresult Result, const char *Function, int Line,
                   const char *File);
 
+void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
+                  const char *File);
+
 void checkErrorUR(ur_result_t Result, const char *Function, int Line,
                   const char *File);
 

@@ -18,6 +18,7 @@
 #include "logger/ur_logger.hpp"
 #include "platform.hpp"
 #include "ur_util.hpp"
+#include <nvml.h>
 
 int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) {
   int value;
@@ -1085,11 +1086,69 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case UR_DEVICE_INFO_IP_VERSION:
-  case UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS:
-  case UR_DEVICE_INFO_FAN_SPEED:
-  case UR_DEVICE_INFO_MIN_POWER_LIMIT:
-  case UR_DEVICE_INFO_MAX_POWER_LIMIT:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  case UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS: {
+    // Fetch the raw NVML bitmask of currently active clock throttle reasons.
+    // Toolkits with CUDA_VERSION >= 12060 use the "ClocksEventReasons" entry
+    // point; older ones use the "ClocksThrottleReasons" one.
+    unsigned long long ClocksEventReasons = 0;
+#if (CUDA_VERSION >= 12060)
+    UR_CHECK_ERROR(nvmlDeviceGetCurrentClocksEventReasons(hDevice->getNVML(),
+                                                          &ClocksEventReasons));
+#else
+    UR_CHECK_ERROR(nvmlDeviceGetCurrentClocksThrottleReasons(
+        hDevice->getNVML(), &ClocksEventReasons));
+#endif
+    ur_device_throttle_reasons_flags_t ThrottleReasons = 0;
+    // NVMLThrottleFlags[i] is the NVML mask that translates to
+    // UrThrottleFlags[i]. Hardware and software thermal slowdown both map to
+    // THERMAL_LIMIT, so their bits are merged with a *bitwise* OR; a logical
+    // OR would collapse the mask to the value 1.
+    constexpr unsigned long long NVMLThrottleFlags[] = {
+        nvmlClocksThrottleReasonSwPowerCap,
+        nvmlClocksThrottleReasonHwThermalSlowdown |
+            nvmlClocksThrottleReasonSwThermalSlowdown,
+        nvmlClocksThrottleReasonHwPowerBrakeSlowdown,
+        nvmlClocksThrottleReasonApplicationsClocksSetting};
+
+    constexpr ur_device_throttle_reasons_flags_t UrThrottleFlags[] = {
+        UR_DEVICE_THROTTLE_REASONS_FLAG_POWER_CAP,
+        UR_DEVICE_THROTTLE_REASONS_FLAG_THERMAL_LIMIT,
+        UR_DEVICE_THROTTLE_REASONS_FLAG_PSU_ALERT,
+        UR_DEVICE_THROTTLE_REASONS_FLAG_SW_RANGE};
+
+    constexpr size_t NumMappedFlags =
+        sizeof(NVMLThrottleFlags) / sizeof(NVMLThrottleFlags[0]);
+    static_assert(NumMappedFlags ==
+                      sizeof(UrThrottleFlags) / sizeof(UrThrottleFlags[0]),
+                  "NVML and UR throttle flag tables must stay parallel");
+
+    for (size_t i = 0; i < NumMappedFlags; ++i) {
+      if (ClocksEventReasons & NVMLThrottleFlags[i]) {
+        ThrottleReasons |= UrThrottleFlags[i];
+        // Clear the handled bits so only unmapped reasons remain afterwards.
+        ClocksEventReasons &= ~NVMLThrottleFlags[i];
+      }
+    }
+    // Any bit still set has no dedicated UR flag and is reported as OTHER.
+    if (ClocksEventReasons) {
+      ThrottleReasons |= UR_DEVICE_THROTTLE_REASONS_FLAG_OTHER;
+    }
+    return ReturnValue(ThrottleReasons);
+  }
+  case UR_DEVICE_INFO_MIN_POWER_LIMIT:
+  case UR_DEVICE_INFO_MAX_POWER_LIMIT: {
+    // Both limits come from one NVML constraints query; propName selects which
+    // of the two values is reported back.
+    unsigned int minLimit = 0;
+    unsigned int maxLimit = 0;
+    auto NVMLHandle = hDevice->getNVML();
+    auto NVMLError = nvmlDeviceGetPowerManagementLimitConstraints(
+        NVMLHandle, &minLimit, &maxLimit);
+    if (NVMLError == NVML_ERROR_NOT_SUPPORTED) {
+      // The constraints query is unavailable: fall back to the power
+      // management limit for the maximum and report the minimum as unknown
+      // (-1).
+      if (propName == UR_DEVICE_INFO_MAX_POWER_LIMIT) {
+        UR_CHECK_ERROR(
+            nvmlDeviceGetPowerManagementLimit(NVMLHandle, &maxLimit));
+        return ReturnValue(static_cast<int32_t>(maxLimit));
+      }
+      return ReturnValue(static_cast<int32_t>(-1));
+    }
+    // Every other failure must be reported rather than silently returning
+    // values that NVML never wrote.
+    UR_CHECK_ERROR(NVMLError);
+    if (propName == UR_DEVICE_INFO_MAX_POWER_LIMIT) {
+      return ReturnValue(static_cast<int32_t>(maxLimit));
+    }
+    return ReturnValue(static_cast<int32_t>(minLimit));
+  }
+  case UR_DEVICE_INFO_FAN_SPEED: {
+    // NVML writes the fan speed into an unsigned out-parameter; the value is
+    // handed back to the caller as int32_t.
+    unsigned int FanSpeed;
+    const auto NVMLHandle = hDevice->getNVML();
+    UR_CHECK_ERROR(nvmlDeviceGetFanSpeed(NVMLHandle, &FanSpeed));
+    return ReturnValue(static_cast<int32_t>(FanSpeed));
+  }
   case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
     return ReturnValue(
         static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));

@@ -36,13 +36,14 @@ struct ur_device_handle_t_ {
   int MaxChosenLocalMem{0};
   bool MaxLocalMemSizeChosen{false};
   uint32_t NumComputeUnits{0};
+  std::once_flag NVMLInitFlag;
+  std::optional<nvmlDevice_t> NVMLDevice;
 
 public:
   ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
                       ur_platform_handle_t platform, uint32_t DevIndex)
       : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1},
         Platform(platform), DeviceIndex{DevIndex} {
-
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
         cuDevice));
@@ -102,11 +103,28 @@ struct ur_device_handle_t_ {
     if (MemoryProviderShared) {
       umfMemoryProviderDestroy(MemoryProviderShared);
     }
+    if (NVMLDevice.has_value()) {
+      // Release the NVML reference taken in getNVML(). The status is
+      // deliberately discarded: this runs during destruction, where an
+      // exception raised by UR_CHECK_ERROR would terminate the process.
+      static_cast<void>(nvmlShutdown());
+    }
     cuDevicePrimaryCtxRelease(CuDevice);
   }
 
   native_type get() const noexcept { return CuDevice; };
 
+  /// Returns the NVML handle that corresponds to this CUDA device.
+  ///
+  /// NVML is initialized lazily, once per device object. Calls to nvmlInit by
+  /// different objects only increase NVML's reference count; each object's
+  /// destructor calls the shutdown method, so resources are released once no
+  /// NVML users are left.
+  ///
+  /// Errors from NVML or the CUDA driver are reported through UR_CHECK_ERROR.
+  /// A failed initialization is retried on the next call.
+  nvmlDevice_t getNVML() {
+    std::call_once(NVMLInitFlag, [this]() {
+      UR_CHECK_ERROR(nvmlInit());
+      try {
+        // CUDA device ordinals and NVML indices are not guaranteed to match
+        // (for example when CUDA_VISIBLE_DEVICES filters or reorders devices),
+        // so the NVML device is looked up by PCI bus id instead of by index.
+        char PciBusId[32] = {};
+        UR_CHECK_ERROR(cuDeviceGetPCIBusId(
+            PciBusId, static_cast<int>(sizeof(PciBusId)), CuDevice));
+        nvmlDevice_t Handle;
+        UR_CHECK_ERROR(nvmlDeviceGetHandleByPciBusId(PciBusId, &Handle));
+        NVMLDevice = Handle;
+      } catch (...) {
+        // Balance the successful nvmlInit above: NVMLDevice stays empty, so
+        // the destructor will not call nvmlShutdown for this attempt, and
+        // call_once will run nvmlInit again on the next call.
+        nvmlShutdown();
+        throw;
+      }
+    });
+    return NVMLDevice.value();
+  };
+
   CUcontext getNativeContext() const noexcept { return CuContext; };
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }

@@ -29,4 +29,6 @@ target_include_directories(test-adapter-cuda PRIVATE
     ${PROJECT_SOURCE_DIR}/source/adapters/cuda
 )
 
-target_link_libraries(test-adapter-cuda PRIVATE cudadrv ${PROJECT_NAME}::umf)
+find_package(CUDAToolkit 10.1 REQUIRED)
+
+target_link_libraries(test-adapter-cuda PRIVATE cudadrv CUDA::nvml ${PROJECT_NAME}::umf)