Commit 89d6850

Make two-offset instances const references to avoid copying.
Got rid of the unused get_src_const_ptr methods in the stack_t structs. Replaced auto with size_t where appropriate. Added const to make compiler analysis easier (and faster).
1 parent 5fd506c commit 89d6850
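
The headline change binds each two-offset instance to a const reference instead of copying it into a local. A minimal sketch of the difference, using a hypothetical two_offsets_t as a stand-in for the structs the indexers return (none of the names below come from the dpctl sources):

#include <cstddef>
#include <vector>

// Hypothetical stand-in for the two-offset structs; the real ones carry
// more state, which is what makes the copies worth avoiding.
struct two_offsets_t
{
    std::size_t first_offset;
    std::size_t second_offset;
};

int main()
{
    std::vector<two_offsets_t> stack(4, two_offsets_t{0, 0});

    // `auto` deduces two_offsets_t, so this copy-constructs a new object.
    auto by_value = stack[1];

    // `const auto &` binds straight to the element: no copy, and the const
    // qualification gives the optimizer a simpler view of the value's lifetime.
    const auto &by_ref = stack[2];

    return static_cast<int>(by_value.first_offset + by_ref.second_offset);
}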

1 file changed: +32 -42 lines changed

dpctl/tensor/libtensor/include/kernels/accumulators.hpp

@@ -113,11 +113,6 @@ template <typename T> class stack_t
         return src_;
     }
 
-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -150,11 +145,6 @@ template <typename T> class stack_strided_t
         return src_;
     }
 
-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -247,16 +237,16 @@ inclusive_scan_base_step(sycl::queue &exec_q,
         cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
                                                       std::move(slm_iscan_tmp)](
                                                   sycl::nd_item<1> it) {
-            size_t gid = it.get_global_id(0);
-            size_t lid = it.get_local_id(0);
+            const size_t gid = it.get_global_id(0);
+            const size_t lid = it.get_local_id(0);
 
-            size_t iter_gid = gid / (acc_groups * wg_size);
-            size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
+            const size_t iter_gid = gid / (acc_groups * wg_size);
+            const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
 
             std::array<outputT, n_wi> local_iscan;
 
             size_t i = chunk_gid * n_wi;
-            auto iter_offsets = iter_indexer(iter_gid);
+            const auto &iter_offsets = iter_indexer(iter_gid);
             const auto &inp_iter_offset = iter_offsets.get_first_offset();
             const auto &out_iter_offset = iter_offsets.get_second_offset();
 
@@ -377,7 +367,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
 
     sycl::event dependent_event = inc_scan_phase1_ev;
     if (n_groups > 1) {
-        auto chunk_size = wg_size * n_wi;
+        const size_t chunk_size = wg_size * n_wi;
 
         // how much of temporary allocation do we need
         size_t n_groups_ = n_groups;
@@ -407,7 +397,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         size_t size_to_update = n_elems;
         while (n_groups_ > 1) {
 
-            size_t src_size = n_groups_ - 1;
+            const size_t src_size = n_groups_ - 1;
             dependent_event =
                 inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
                                          NoOpIndexerT, NoOpIndexerT,
@@ -426,19 +416,19 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
     for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
          ++reverse_stack_id)
     {
-        auto stack_id = stack.size() - 1 - reverse_stack_id;
+        const size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
-        auto stack_elem = stack[stack_id];
+        const auto &stack_elem = stack[stack_id];
         outputT *src = stack_elem.get_src_ptr();
-        size_t src_size = stack_elem.get_size();
+        const size_t src_size = stack_elem.get_size();
         outputT *local_scans = stack_elem.get_local_scans_ptr();
 
         // output[ chunk_size * (i + 1) + j] += temp[i]
         dependent_event = exec_q.submit([&](sycl::handler &cgh) {
             cgh.depends_on(dependent_event);
 
             constexpr nwiT updates_per_wi = n_wi;
-            size_t n_items = ceiling_quotient<size_t>(src_size, n_wi);
+            const size_t n_items = ceiling_quotient<size_t>(src_size, n_wi);
 
             using UpdateKernelName =
                 class inclusive_scan_1d_iter_chunk_update_krn<
@@ -448,12 +438,12 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
             cgh.parallel_for<UpdateKernelName>(
                 {n_items}, [chunk_size, src, src_size, local_scans, scan_op,
                             identity](auto wiid) {
-                    auto gid = n_wi * wiid[0];
+                    const size_t gid = n_wi * wiid[0];
 #pragma unroll
-                    for (auto i = 0; i < updates_per_wi; ++i) {
-                        auto src_id = gid + i;
+                    for (size_t i = 0; i < updates_per_wi; ++i) {
+                        const size_t src_id = gid + i;
                         if (src_id < src_size) {
-                            auto scan_id = (src_id / chunk_size);
+                            const size_t scan_id = (src_id / chunk_size);
                             src[src_id] =
                                 (scan_id > 0)
                                     ? scan_op(src[src_id],
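
The loop change in this hunk is not only stylistic: with `auto i = 0` the index deduces to int, so the quantities it combines with (gid, src_size, chunk_size, all size_t) mix signed and unsigned arithmetic. A self-contained check of the deduction (standard C++, nothing dpctl-specific):

#include <cstddef>
#include <type_traits>

int main()
{
    auto i = 0; // the literal 0 deduces int, not an unsigned type
    static_assert(std::is_same_v<decltype(i), int>, "auto from 0 is int");

    std::size_t j = 0; // the explicit type the diff switches to
    static_assert(std::is_same_v<decltype(j), std::size_t>, "size_t index");

    // Mixing the two (as the old `gid + i` did) forces sign conversions.
    return static_cast<int>(j + static_cast<std::size_t>(i));
}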
@@ -511,7 +501,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
                                          include_initial>(
@@ -520,7 +510,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
                                          include_initial>(
@@ -586,13 +576,13 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
 
     sycl::event dependent_event = inc_scan_phase1_ev;
     if (acc_groups > 1) {
-        auto chunk_size = wg_size * n_wi;
+        const size_t chunk_size = wg_size * n_wi;
 
         // how much of temporary allocation do we need
         size_t acc_groups_ = acc_groups;
         size_t temp_size = 0;
         while (acc_groups_ > 1) {
-            const auto this_size = (acc_groups_ - 1);
+            const size_t this_size = (acc_groups_ - 1);
             temp_size += this_size;
             acc_groups_ = ceiling_quotient<size_t>(this_size, chunk_size);
         }
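
The sizing loop above computes the scratch allocation for the multi-level scan: every level contributes (group count - 1) carry slots, and the group count shrinks by a factor of chunk_size per level, so the series collapses quickly. A standalone rerun of the same arithmetic; ceiling_quotient is re-implemented here as a plain stand-in for the header's templated helper, and the input size is arbitrary:

#include <cstddef>
#include <cstdio>

// Stand-in for the ceiling_quotient<size_t> helper used in the header.
static std::size_t ceiling_quotient(std::size_t n, std::size_t m)
{
    return (n + m - 1) / m;
}

int main()
{
    const std::size_t chunk_size = 256 * 8; // wg_size * n_wi on the CPU path
    std::size_t acc_groups_ = 10000;        // arbitrary example input
    std::size_t temp_size = 0;
    while (acc_groups_ > 1) {
        const std::size_t this_size = acc_groups_ - 1;
        temp_size += this_size;
        acc_groups_ = ceiling_quotient(this_size, chunk_size);
    }
    // 9999 slots for level 0, then 4 for level 1: 10003 total.
    std::printf("temp elements needed: %zu\n", temp_size);
    return 0;
}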
@@ -683,16 +673,16 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
     for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size() - 1;
          ++reverse_stack_id)
     {
-        auto stack_id = stack.size() - 1 - reverse_stack_id;
+        const size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
-        auto stack_elem = stack[stack_id];
+        const auto &stack_elem = stack[stack_id];
         outputT *src = stack_elem.get_src_ptr();
         size_t src_size = stack_elem.get_size();
         outputT *local_scans = stack_elem.get_local_scans_ptr();
         size_t local_stride = stack_elem.get_local_stride();
 
         constexpr nwiT updates_per_wi = n_wi;
-        size_t update_nelems =
+        const size_t update_nelems =
             ceiling_quotient<size_t>(src_size, updates_per_wi);
 
         dependent_event = exec_q.submit([&](sycl::handler &cgh) {
@@ -739,14 +729,14 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
 
     // last stack element is always directly to output
     {
-        auto stack_elem = stack[0];
+        const auto &stack_elem = stack[0];
         outputT *src = stack_elem.get_src_ptr();
-        size_t src_size = stack_elem.get_size();
+        const size_t src_size = stack_elem.get_size();
         outputT *local_scans = stack_elem.get_local_scans_ptr();
-        size_t local_stride = stack_elem.get_local_stride();
+        const size_t local_stride = stack_elem.get_local_stride();
 
         constexpr nwiT updates_per_wi = n_wi;
-        size_t update_nelems =
+        const size_t update_nelems =
             ceiling_quotient<size_t>(src_size, updates_per_wi);
 
         dependent_event = exec_q.submit([&](sycl::handler &cgh) {
@@ -864,7 +854,7 @@ accumulate_strided_impl(sycl::queue &q,
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
                                 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -875,7 +865,7 @@ accumulate_strided_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
                                 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -920,7 +910,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          NoOpIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -929,7 +919,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
                                          NoOpIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -1028,7 +1018,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          StridedIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -1037,7 +1027,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
                                          StridedIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
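
For reference, the update step several of these hunks touch (the `output[ chunk_size * (i + 1) + j] += temp[i]` comment in the source) folds the carried total of chunk k - 1 into every element of chunk k. A host-side analogue using plain addition as scan_op; the data and sizes below are illustrative only, not taken from dpctl:

#include <cstddef>
#include <vector>

int main()
{
    constexpr std::size_t chunk_size = 4;
    // Two chunks of {1, 2, 3, 4}, each already inclusively scanned on its own.
    std::vector<int> src = {1, 3, 6, 10, 1, 3, 6, 10};
    // local_scans[k] holds the running total through the end of chunk k.
    const std::vector<int> local_scans = {10, 20};

    for (std::size_t src_id = 0; src_id < src.size(); ++src_id) {
        const std::size_t scan_id = src_id / chunk_size;
        if (scan_id > 0) {
            src[src_id] += local_scans[scan_id - 1]; // scan_op is + here
        }
    }
    // src is now the full inclusive scan: {1, 3, 6, 10, 11, 13, 16, 20}.
    return src.back() == 20 ? 0 : 1;
}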
