Skip to content

Commit 34ae129

Browse files
Merge pull request #1872 from IntelPython/add-empty-task-submission
Add alternative implementation of device timer to SyclTimer class
2 parents 2a4714f + 7752078 commit 34ae129

File tree

4 files changed

+236
-19
lines changed

4 files changed

+236
-19
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88

99
### Added
1010

11-
### Change
11+
### Changed
1212

1313
* Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829)
1414
* Improved performance of copying operation to C-/F-contig array, with optimization for batch of square matrices [gh-1850](https://github.com/IntelPython/dpctl/pull/1850)
1515
* Improved performance of `tensor.argsort` function for all types [gh-1859](https://github.com/IntelPython/dpctl/pull/1859)
1616
* Improved performance of `tensor.sort` and `tensor.argsort` for short arrays in the range [16, 64] elements [gh-1866](https://github.com/IntelPython/dpctl/pull/1866)
1717
* Implement radix sort algorithm to be used in `dpt.sort` and `dpt.argsort` [gh-1867](https://github.com/IntelPython/dpctl/pull/1867)
18+
* Extended `dpctl.SyclTimer` with `device_timer` keyword, implementing different methods of collecting device times [gh-1872](https://github.com/IntelPython/dpctl/pull/1872)
1819

1920
### Fixed
2021
* Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877)

dpctl/_sycl_timer.py

Lines changed: 113 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,47 @@ def device_dt(self):
4444
return self._device_dt
4545

4646

47+
class BaseDeviceTimer:
    """
    Common base of the device-timer helpers.

    Validates the queue passed to the constructor and keeps it in the
    ``queue`` attribute for use by subclasses.

    Args:
        sycl_queue (SyclQueue):
            Queue the timer submits its tasks to.

    Raises:
        TypeError: if ``sycl_queue`` is not an instance of ``SyclQueue``.
    """

    __slots__ = ["queue"]

    def __init__(self, sycl_queue):
        # accept only genuine SyclQueue instances, reject everything else
        if isinstance(sycl_queue, SyclQueue):
            self.queue = sycl_queue
            return
        raise TypeError(f"Expected type SyclQueue, got {type(sycl_queue)}")
54+
55+
56+
class QueueBarrierDeviceTimer(BaseDeviceTimer):
    """
    Device timer whose events come from barriers submitted to the queue.

    Args:
        sycl_queue (SyclQueue):
            Queue to submit barriers to; validated by the base class.
    """

    __slots__ = []

    def __init__(self, sycl_queue):
        super().__init__(sycl_queue)

    def get_event(self):
        """Submit a barrier to the stored queue and return its event."""
        barrier_event = self.queue.submit_barrier()
        return barrier_event
64+
65+
66+
class OrderManagerDeviceTimer(BaseDeviceTimer):
    """
    Device timer whose events come from empty tasks ordered through the
    sequential order manager associated with the queue.

    Args:
        sycl_queue (SyclQueue):
            Queue to submit empty tasks to; validated by the base class.
    """

    __slots__ = ["_order_manager", "_submit_empty_task_fn"]

    def __init__(self, sycl_queue):
        # imported lazily, inside the constructor, exactly where they are used
        import dpctl.utils._seq_order_keeper as order_keeper
        from dpctl.utils import SequentialOrderManager as order_managers

        super().__init__(sycl_queue)
        self._order_manager = order_managers[self.queue]
        self._submit_empty_task_fn = order_keeper._submit_empty_task

    def get_event(self):
        """
        Submit an empty task that depends on the events currently listed
        by the order manager, record the resulting event with the manager,
        and return it.
        """
        submit = self._submit_empty_task_fn
        manager = self._order_manager
        fence_event = submit(
            sycl_queue=self.queue, depends=manager.submitted_events
        )
        # the same event is passed for both members of the pair
        manager.add_event_pair(fence_event, fence_event)
        return fence_event
83+
84+
4785
class SyclTimer:
4886
"""
49-
Context to measure device time and host wall-time of execution
50-
of commands submitted to :class:`dpctl.SyclQueue`.
87+
Context to time execution of tasks submitted to :class:`dpctl.SyclQueue`.
5188
5289
:Example:
5390
.. code-block:: python
@@ -58,40 +95,81 @@ class SyclTimer:
5895
q = dpctl.SyclQueue(property="enable_profiling")
5996
6097
# create the timer
61-
milliseconds_sc = 1e-3
98+
milliseconds_sc = 1e3
6299
timer = dpctl.SyclTimer(time_scale = milliseconds_sc)
63100
101+
untimed_code_block_1
64102
# use the timer
65103
with timer(queue=q):
66-
code_block1
104+
timed_code_block1
105+
106+
untimed_code_block_2
67107
68108
# use the timer
69109
with timer(queue=q):
70-
code_block2
110+
timed_code_block2
111+
112+
untimed_code_block_3
71113
72114
# retrieve elapsed times in milliseconds
73115
wall_dt, device_dt = timer.dt
74116
75117
.. note::
76-
The timer submits barriers to the queue at the entrance and the
118+
The timer submits tasks to the queue at the entrance and the
77119
exit of the context and uses profiling information from events
78120
associated with these submissions to perform the timing. Thus
79121
:class:`dpctl.SyclTimer` requires the queue with ``"enable_profiling"``
80122
property. In order to be able to collect the profiling information,
81-
the ``dt`` property ensures that both submitted barriers complete their
82-
execution and thus effectively synchronizes the queue.
123+
the ``dt`` property ensures that both tasks submitted by the timer
124+
complete their execution and thus effectively synchronizes the queue.
125+
126+
Execution of the above example results in the following task graph,
127+
where each group of tasks is ordered after the one preceding it,
128+
``[tasks_of_untimed_block1]``, ``[timer_fence_start_task]``,
129+
``[tasks_of_timed_block1]``, ``[timer_fence_finish_task]``,
130+
``[tasks_of_untimed_block2]``, ``[timer_fence_start_task]``,
131+
``[tasks_of_timed_block2]``, ``[timer_fence_finish_task]``,
132+
``[tasks_of_untimed_block3]``.
133+
134+
``device_timer`` keyword argument controls the type of tasks submitted.
135+
With ``"queue_barrier"`` value, queue barrier tasks are used. With
136+
``"order_manager"`` value, a single empty body task is inserted
137+
and the order manager (used by all `dpctl.tensor` operations) is used to
138+
order these tasks so that they fence operations performed within
139+
timer's context.
140+
141+
Timing offloading operations that do not use the order manager with
142+
the timer that uses ``"order_manager"`` as ``device_timer`` value
143+
will be misleading because the tasks submitted by the timer will not
144+
be ordered with respect to tasks we intend to time.
145+
146+
Note that the host timer effectively measures the time of task
147+
submissions. To measure host timer wall-time that includes execution
148+
of submitted tasks, make sure to include a synchronization point in
149+
the timed block.
150+
151+
:Example:
152+
.. code-block:: python
153+
154+
with timer(q):
155+
timed_block
156+
q.wait()
83157
84158
Args:
85159
host_timer (callable, optional):
86160
A callable such that host_timer() returns current
87161
host time in seconds.
88162
Default: :py:func:`timeit.default_timer`.
163+
device_timer (Literal["queue_barrier", "order_manager"], optional):
164+
Device timing method. Default: "queue_barrier".
89165
time_scale (Union[int, float], optional):
90-
Ratio of the unit of time of interest and one second.
166+
Ratio of one second and the unit of time-scale of interest.
91167
Default: ``1``.
92168
"""
93169

94-
def __init__(self, host_timer=timeit.default_timer, time_scale=1):
170+
def __init__(
171+
self, host_timer=timeit.default_timer, device_timer=None, time_scale=1
172+
):
95173
"""
96174
Create new instance of :class:`.SyclTimer`.
97175
@@ -100,6 +178,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1):
100178
A function that takes no arguments and returns a value
101179
measuring time.
102180
Default: :meth:`timeit.default_timer`.
181+
device_timer (Literal["queue_barrier", "order_manager"], optional):
182+
Device timing method. Default: "queue_barrier"
103183
time_scale (Union[int, float], optional):
104184
Scaling factor applied to durations measured by
105185
the host_timer. Default: ``1``.
@@ -109,11 +189,26 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1):
109189
self.queue = None
110190
self.host_times = []
111191
self.bracketing_events = []
192+
self._context_data = list()
193+
if device_timer is None:
194+
device_timer = "queue_barrier"
195+
if device_timer == "queue_barrier":
196+
self._device_timer_class = QueueBarrierDeviceTimer
197+
elif device_timer == "order_manager":
198+
self._device_timer_class = OrderManagerDeviceTimer
199+
else:
200+
raise ValueError(
201+
"Supported values for device_timer keyword are "
202+
"'queue_barrier', 'order_manager', got "
203+
f"'{device_timer}'"
204+
)
205+
self._device_timer = None
112206

113207
def __call__(self, queue=None):
114208
if isinstance(queue, SyclQueue):
115209
if queue.has_enable_profiling:
116210
self.queue = queue
211+
self._device_timer = self._device_timer_class(queue)
117212
else:
118213
raise ValueError(
119214
"The given queue was not created with the "
@@ -127,17 +222,17 @@ def __call__(self, queue=None):
127222
return self
128223

129224
def __enter__(self):
130-
self._event_start = self.queue.submit_barrier()
131-
self._host_start = self.timer()
225+
_event_start = self._device_timer.get_event()
226+
_host_start = self.timer()
227+
self._context_data.append((_event_start, _host_start))
132228
return self
133229

134230
def __exit__(self, *args):
135-
self.host_times.append((self._host_start, self.timer()))
136-
self.bracketing_events.append(
137-
(self._event_start, self.queue.submit_barrier())
138-
)
139-
del self._event_start
140-
del self._host_start
231+
_event_end = self._device_timer.get_event()
232+
_host_end = self.timer()
233+
_event_start, _host_start = self._context_data.pop()
234+
self.host_times.append((_host_start, _host_end))
235+
self.bracketing_events.append((_event_start, _event_end))
141236

142237
@property
143238
def dt(self):

dpctl/tests/test_sycl_timer.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Data Parallel Control (dpctl)
2+
#
3+
# Copyright 2020-2024 Intel Corporation
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import time
18+
19+
import pytest
20+
21+
import dpctl
22+
import dpctl.tensor as dpt
23+
24+
25+
@pytest.fixture
def profiling_queue():
    """
    Provide a queue created with the ``"enable_profiling"`` property.

    The test requesting this fixture is skipped when queue construction
    raises :class:`dpctl.SyclQueueCreationError`.
    """
    try:
        q = dpctl.SyclQueue(property="enable_profiling")
    except dpctl.SyclQueueCreationError:
        pytest.skip(
            "Could not create profiling queue for default-selected device"
        )
    return q
34+
35+
36+
@pytest.mark.parametrize(
    "device_timer", [None, "queue_barrier", "order_manager"]
)
def test_sycl_timer_queue_barrier(profiling_queue, device_timer):
    """
    Time a short chain of tensor operations for every ``device_timer``
    setting and check that both reported durations are positive.
    """
    device = dpt.Device.create_device(profiling_queue)
    sycl_timer = dpctl.SyclTimer(
        host_timer=time.perf_counter,
        device_timer=device_timer,
        time_scale=1e3,
    )

    with sycl_timer(device.sycl_queue):
        grid = dpt.linspace(0, 1, num=10**6, device=device)
        values = 3.0 - dpt.square(grid - 0.5)
        ordered = dpt.sort(values)
        largest_by_sort = ordered[-1]
        largest_by_reduction = dpt.max(values)

    host_dt, device_dt = sycl_timer.dt

    # the maximum found by sorting must agree with the one from reduction
    assert dpt.all(largest_by_sort == largest_by_reduction)
    assert host_dt > 0
    assert device_dt > 0
58+
59+
60+
def test_sycl_timer_accumulation(profiling_queue):
    """
    Re-enter the same timer once per iteration and check that the device
    time it reports afterwards is positive.
    """
    q = profiling_queue
    n_iterations = 16

    timer = dpctl.SyclTimer(
        host_timer=time.perf_counter,
        device_timer="order_manager",
        time_scale=1e3,
    )

    # starting guess
    x = dpt.linspace(0, 1, num=10**6, sycl_queue=q)

    history = [x]

    # Newton's method with Aitken's acceleration,
    # x <- x - f(x)/f'(x) for f(x) = x - cos(x)
    for _ in range(n_iterations):
        # the timed region covers the Newton step only
        with timer(q):
            sin_x = dpt.sin(x)
            x = (dpt.cos(x) + x * sin_x) / (1 + sin_x)
        history.append(x)
        history = history[-3:]
        if len(history) == 3:
            # Aitken's acceleration from the three latest iterates
            newest, middle, oldest = history[-1], history[-2], history[-3]
            d1 = newest - middle
            d2 = middle - oldest
            if not dpt.any(d1 == d2):
                x = newest - dpt.square(d1) / (d1 - d2)

    # device time reported after all timed iterations
    dev_dt = timer.dt.device_dt
    assert dev_dt > 0

    # all elements must have converged to the same value
    assert dpt.max(x) - dpt.min(x) < 1e-5
98+
99+
100+
def test_sycl_timer_validation():
    """
    Check argument validation: an unknown ``device_timer`` value raises
    ``ValueError`` and calling the timer with a non-queue object raises
    ``TypeError``.
    """
    # unsupported device_timer keyword value
    with pytest.raises(ValueError):
        dpctl.SyclTimer(device_timer="invalid")

    # an object that is not a SyclQueue
    not_a_queue = ...
    default_timer = dpctl.SyclTimer()

    with pytest.raises(TypeError):
        default_timer(not_a_queue)

dpctl/utils/src/order_keeper.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,17 @@ PYBIND11_MODULE(_seq_order_keeper, m)
2626
&SequentialOrder::add_to_submitted_events)
2727
.def("wait", &SequentialOrder::wait,
2828
py::call_guard<py::gil_scoped_release>());
29+
30+
auto submit_empty_task_fn =
31+
[](sycl::queue &exec_q,
32+
const std::vector<sycl::event> &depends) -> sycl::event {
33+
return exec_q.submit([&](sycl::handler &cgh) {
34+
cgh.depends_on(depends);
35+
cgh.single_task([]() {
36+
// empty body
37+
});
38+
});
39+
};
40+
m.def("_submit_empty_task", submit_empty_task_fn, py::arg("sycl_queue"),
41+
py::arg("depends") = py::list());
2942
}

0 commit comments

Comments
 (0)