@@ -113,11 +113,6 @@ template <typename T> class stack_t
         return src_;
     }
 
-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -150,11 +145,6 @@ template <typename T> class stack_strided_t
         return src_;
     }
 
-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
    {
         return size_;
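Both stack helpers now expose only the accessors that are actually used later in this diff. For orientation, a minimal sketch (not copied from the PR) of how stack_t reads after this change follows; the constructor and the local_scans_ member are assumptions inferred from the accessors exercised in the update loops below.

#include <cstddef>

// Hypothetical reconstruction of the stack_t helper after the removal of
// get_src_const_ptr(); member and constructor shapes are assumptions.
template <typename T> class stack_t
{
    T *src_ = nullptr;
    std::size_t size_ = 0;
    T *local_scans_ = nullptr;

public:
    stack_t(T *src, std::size_t sz, T *local_scans)
        : src_(src), size_(sz), local_scans_(local_scans)
    {
    }

    T *get_src_ptr() const { return src_; }
    std::size_t get_size() const { return size_; }
    T *get_local_scans_ptr() const { return local_scans_; }
};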
@@ -247,16 +237,16 @@ inclusive_scan_base_step(sycl::queue &exec_q,
         cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
                                                       std::move(slm_iscan_tmp)](
                                                   sycl::nd_item<1> it) {
-            size_t gid = it.get_global_id(0);
-            size_t lid = it.get_local_id(0);
+            const size_t gid = it.get_global_id(0);
+            const size_t lid = it.get_local_id(0);
 
-            size_t iter_gid = gid / (acc_groups * wg_size);
-            size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
+            const size_t iter_gid = gid / (acc_groups * wg_size);
+            const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
 
             std::array<outputT, n_wi> local_iscan;
 
             size_t i = chunk_gid * n_wi;
-            auto iter_offsets = iter_indexer(iter_gid);
+            const auto &iter_offsets = iter_indexer(iter_gid);
             const auto &inp_iter_offset = iter_offsets.get_first_offset();
             const auto &out_iter_offset = iter_offsets.get_second_offset();
 
@@ -377,7 +367,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
377
367
378
368
sycl::event dependent_event = inc_scan_phase1_ev;
379
369
if (n_groups > 1 ) {
380
- auto chunk_size = wg_size * n_wi;
370
+ const size_t chunk_size = wg_size * n_wi;
381
371
382
372
// how much of temporary allocation do we need
383
373
size_t n_groups_ = n_groups;
@@ -407,7 +397,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
407
397
size_t size_to_update = n_elems;
408
398
while (n_groups_ > 1 ) {
409
399
410
- size_t src_size = n_groups_ - 1 ;
400
+ const size_t src_size = n_groups_ - 1 ;
411
401
dependent_event =
412
402
inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
413
403
NoOpIndexerT, NoOpIndexerT,
@@ -426,19 +416,19 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
426
416
for (size_t reverse_stack_id = 0 ; reverse_stack_id < stack.size ();
427
417
++reverse_stack_id)
428
418
{
429
- auto stack_id = stack.size () - 1 - reverse_stack_id;
419
+ const size_t stack_id = stack.size () - 1 - reverse_stack_id;
430
420
431
- auto stack_elem = stack[stack_id];
421
+ const auto & stack_elem = stack[stack_id];
432
422
outputT *src = stack_elem.get_src_ptr ();
433
- size_t src_size = stack_elem.get_size ();
423
+ const size_t src_size = stack_elem.get_size ();
434
424
outputT *local_scans = stack_elem.get_local_scans_ptr ();
435
425
436
426
// output[ chunk_size * (i + 1) + j] += temp[i]
437
427
dependent_event = exec_q.submit ([&](sycl::handler &cgh) {
438
428
cgh.depends_on (dependent_event);
439
429
440
430
constexpr nwiT updates_per_wi = n_wi;
441
- size_t n_items = ceiling_quotient<size_t >(src_size, n_wi);
431
+ const size_t n_items = ceiling_quotient<size_t >(src_size, n_wi);
442
432
443
433
using UpdateKernelName =
444
434
class inclusive_scan_1d_iter_chunk_update_krn <
@@ -448,12 +438,12 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
448
438
cgh.parallel_for <UpdateKernelName>(
449
439
{n_items}, [chunk_size, src, src_size, local_scans, scan_op,
450
440
identity](auto wiid) {
451
- auto gid = n_wi * wiid[0 ];
441
+ const size_t gid = n_wi * wiid[0 ];
452
442
#pragma unroll
453
- for (auto i = 0 ; i < updates_per_wi; ++i) {
454
- auto src_id = gid + i;
443
+ for (size_t i = 0 ; i < updates_per_wi; ++i) {
444
+ const size_t src_id = gid + i;
455
445
if (src_id < src_size) {
456
- auto scan_id = (src_id / chunk_size);
446
+ const size_t scan_id = (src_id / chunk_size);
457
447
src[src_id] =
458
448
(scan_id > 0 )
459
449
? scan_op (src[src_id],
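For readers skimming the diff, here is a hedged, host-side illustration (not part of the PR) of the chunk-update rule this kernel implements, with plus standing in for scan_op: every element of chunk scan_id, except the first chunk, is combined with the carried total of the preceding chunks, local_scans[scan_id - 1].

#include <cstddef>
#include <vector>

// Apply the carried per-chunk totals to a block-wise inclusive scan result.
// Mirrors the kernel's src[src_id] = scan_op(src[src_id], local_scans[scan_id - 1]).
void apply_chunk_update(std::vector<int> &src,
                        const std::vector<int> &local_scans,
                        std::size_t chunk_size)
{
    for (std::size_t src_id = 0; src_id < src.size(); ++src_id) {
        const std::size_t scan_id = src_id / chunk_size;
        if (scan_id > 0) {
            src[src_id] += local_scans[scan_id - 1];
        }
    }
}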
@@ -511,7 +501,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
511
501
const sycl::device &dev = q.get_device ();
512
502
if (dev.has (sycl::aspect::cpu)) {
513
503
constexpr nwiT n_wi_for_cpu = 8 ;
514
- size_t wg_size = 256 ;
504
+ const size_t wg_size = 256 ;
515
505
comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
516
506
transformerT, AccumulateOpT,
517
507
include_initial>(
@@ -520,7 +510,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
520
510
}
521
511
else {
522
512
constexpr nwiT n_wi_for_gpu = 4 ;
523
- size_t wg_size = 256 ;
513
+ const size_t wg_size = 256 ;
524
514
comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
525
515
transformerT, AccumulateOpT,
526
516
include_initial>(
@@ -586,13 +576,13 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
586
576
587
577
sycl::event dependent_event = inc_scan_phase1_ev;
588
578
if (acc_groups > 1 ) {
589
- auto chunk_size = wg_size * n_wi;
579
+ const size_t chunk_size = wg_size * n_wi;
590
580
591
581
// how much of temporary allocation do we need
592
582
size_t acc_groups_ = acc_groups;
593
583
size_t temp_size = 0 ;
594
584
while (acc_groups_ > 1 ) {
595
- const auto this_size = (acc_groups_ - 1 );
585
+ const size_t this_size = (acc_groups_ - 1 );
596
586
temp_size += this_size;
597
587
acc_groups_ = ceiling_quotient<size_t >(this_size, chunk_size);
598
588
}
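The loop above sizes the temporary allocation needed for each level of the scan hierarchy. ceiling_quotient is presumably the usual rounded-up integer division; a sketch of the assumed behavior (not the project's actual definition) is:

#include <cstddef>

// Assumed behavior: divide n by m, rounding up (valid for m > 0).
template <typename T> constexpr T ceiling_quotient(T n, T m)
{
    return (n + m - 1) / m;
}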
@@ -683,16 +673,16 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
683
673
for (size_t reverse_stack_id = 0 ; reverse_stack_id < stack.size () - 1 ;
684
674
++reverse_stack_id)
685
675
{
686
- auto stack_id = stack.size () - 1 - reverse_stack_id;
676
+ const size_t stack_id = stack.size () - 1 - reverse_stack_id;
687
677
688
- auto stack_elem = stack[stack_id];
678
+ const auto & stack_elem = stack[stack_id];
689
679
outputT *src = stack_elem.get_src_ptr ();
690
680
size_t src_size = stack_elem.get_size ();
691
681
outputT *local_scans = stack_elem.get_local_scans_ptr ();
692
682
size_t local_stride = stack_elem.get_local_stride ();
693
683
694
684
constexpr nwiT updates_per_wi = n_wi;
695
- size_t update_nelems =
685
+ const size_t update_nelems =
696
686
ceiling_quotient<size_t >(src_size, updates_per_wi);
697
687
698
688
dependent_event = exec_q.submit ([&](sycl::handler &cgh) {
@@ -739,14 +729,14 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
739
729
740
730
// last stack element is always directly to output
741
731
{
742
- auto stack_elem = stack[0 ];
732
+ const auto & stack_elem = stack[0 ];
743
733
outputT *src = stack_elem.get_src_ptr ();
744
- size_t src_size = stack_elem.get_size ();
734
+ const size_t src_size = stack_elem.get_size ();
745
735
outputT *local_scans = stack_elem.get_local_scans_ptr ();
746
- size_t local_stride = stack_elem.get_local_stride ();
736
+ const size_t local_stride = stack_elem.get_local_stride ();
747
737
748
738
constexpr nwiT updates_per_wi = n_wi;
749
- size_t update_nelems =
739
+ const size_t update_nelems =
750
740
ceiling_quotient<size_t >(src_size, updates_per_wi);
751
741
752
742
dependent_event = exec_q.submit ([&](sycl::handler &cgh) {
@@ -864,7 +854,7 @@ accumulate_strided_impl(sycl::queue &q,
864
854
sycl::event comp_ev;
865
855
if (dev.has (sycl::aspect::cpu)) {
866
856
constexpr nwiT n_wi_for_cpu = 8 ;
867
- size_t wg_size = 256 ;
857
+ const size_t wg_size = 256 ;
868
858
comp_ev =
869
859
inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
870
860
OutIndexerT, InpIndexerT, OutIndexerT,
@@ -875,7 +865,7 @@ accumulate_strided_impl(sycl::queue &q,
875
865
}
876
866
else {
877
867
constexpr nwiT n_wi_for_gpu = 4 ;
878
- size_t wg_size = 256 ;
868
+ const size_t wg_size = 256 ;
879
869
comp_ev =
880
870
inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
881
871
OutIndexerT, InpIndexerT, OutIndexerT,
@@ -920,7 +910,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
920
910
const sycl::device &dev = q.get_device ();
921
911
if (dev.has (sycl::aspect::cpu)) {
922
912
constexpr nwiT n_wi_for_cpu = 8 ;
923
- size_t wg_size = 256 ;
913
+ const size_t wg_size = 256 ;
924
914
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
925
915
NoOpIndexerT, transformerT,
926
916
AccumulateOpT, include_initial>(
@@ -929,7 +919,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
929
919
}
930
920
else {
931
921
constexpr nwiT n_wi_for_gpu = 4 ;
932
- size_t wg_size = 256 ;
922
+ const size_t wg_size = 256 ;
933
923
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
934
924
NoOpIndexerT, transformerT,
935
925
AccumulateOpT, include_initial>(
@@ -1028,7 +1018,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
1028
1018
sycl::event comp_ev;
1029
1019
if (dev.has (sycl::aspect::cpu)) {
1030
1020
constexpr nwiT n_wi_for_cpu = 8 ;
1031
- size_t wg_size = 256 ;
1021
+ const size_t wg_size = 256 ;
1032
1022
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
1033
1023
StridedIndexerT, transformerT,
1034
1024
AccumulateOpT, include_initial>(
@@ -1037,7 +1027,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
1037
1027
}
1038
1028
else {
1039
1029
constexpr nwiT n_wi_for_gpu = 4 ;
1040
- size_t wg_size = 256 ;
1030
+ const size_t wg_size = 256 ;
1041
1031
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
1042
1032
StridedIndexerT, transformerT,
1043
1033
AccumulateOpT, include_initial>(