@@ -232,7 +232,7 @@ class __SYCL_EXPORT handler {
232
232
/// Remembers the given source code location in this handler.
///
/// \param CodeLoc is the code location that is copied into MCodeLoc.
void saveCodeLoc(detail::code_location CodeLoc) {
  this->MCodeLoc = CodeLoc;
}
233
233
234
234
// / Stores the given \param Event to the \param Queue.
235
- // / Even thought MQueue is a field of handler, the method addEvent() of
235
+ // / Even though MQueue is a field of handler, the method addEvent() of
236
236
// / queue_impl class cannot be called inside this handler.hpp file
237
237
// / as queue_impl is an incomplete class for handler.
238
238
static void addEventToQueue (shared_ptr_class<detail::queue_impl> Queue,
@@ -814,7 +814,7 @@ class __SYCL_EXPORT handler {
814
814
// / user's lambda function \param KernelFunc and does one iteration of
815
815
// / reduction of elements in each of work-groups.
816
816
// / This version uses tree-reduction algorithm to reduce elements in each
817
- // / of work-groups. At the end of each work-groups the partial sum is written
817
+ // / of work-groups. At the end of each work-group the partial sum is written
818
818
// / to a global buffer.
819
819
// /
820
820
// / Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
@@ -827,21 +827,22 @@ class __SYCL_EXPORT handler {
827
827
size_t NWorkGroups = Range.get_group_range ().size ();
828
828
829
829
bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0 ;
830
- size_t InefficientCase = ( IsUnderLoaded || ( WGSize & (WGSize - 1 ))) ? 1 : 0 ;
830
+ bool IsEfficientCase = ! IsUnderLoaded && (( WGSize & (WGSize - 1 )) == 0 ) ;
831
831
832
832
bool IsUpdateOfUserAcc =
833
833
Reduction::accessor_mode == access::mode::read_write &&
834
834
NWorkGroups == 1 ;
835
835
836
836
// Use local memory to reduce elements in work-groups into 0-th element.
837
837
// If work-groups are not fully loaded or WGSize is not a power of two,
// then WGSize+1 elements are allocated.
838
- // The additional last element is used to catch reduce elements that could
839
- // otherwise be lost in the tree-reduction algorithm used in the kernel.
840
- auto LocalReds = Redu.getReadWriteLocalAcc (WGSize + InefficientCase, *this );
838
+ // The additional last element is used to catch elements that could
839
+ // otherwise be lost in the tree-reduction algorithm.
840
+ size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1 );
841
+ auto LocalReds = Redu.getReadWriteLocalAcc (NumLocalElements, *this );
841
842
842
843
auto Out = Redu.getWriteAccForPartialReds (NWorkGroups, 0 , *this );
843
844
auto ReduIdentity = Redu.getIdentity ();
844
- if (!InefficientCase ) {
845
+ if (IsEfficientCase ) {
845
846
// Efficient case: work-groups are fully loaded and work-group size
846
847
// is power of two.
847
848
parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
@@ -863,7 +864,7 @@ class __SYCL_EXPORT handler {
863
864
NDIt.barrier ();
864
865
}
865
866
866
- // Compute the the partial sum/reduction for the work-group.
867
+ // Compute the partial sum/reduction for the work-group.
867
868
if (LID == 0 )
868
869
Out.get_pointer ().get ()[NDIt.get_group_linear_id ()] =
869
870
IsUpdateOfUserAcc ? BOp (*(Out.get_pointer ()), LocalReds[0 ])
@@ -904,7 +905,7 @@ class __SYCL_EXPORT handler {
904
905
PrevStep = CurStep;
905
906
}
906
907
907
- // Compute the the partial sum/reduction for the work-group.
908
+ // Compute the partial sum/reduction for the work-group.
908
909
if (LID == 0 ) {
909
910
auto GrID = NDIt.get_group_linear_id ();
910
911
auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
@@ -918,7 +919,7 @@ class __SYCL_EXPORT handler {
918
919
// / Implements a command group function that enqueues a kernel that does one
919
920
// / iteration of reduction of elements in each of work-groups.
920
921
// / This version uses tree-reduction algorithm to reduce elements in each
921
- // / of work-groups. At the end of each work-groups the partial sum is written
922
+ // / of work-groups. At the end of each work-group the partial sum is written
922
923
// / to a global buffer.
923
924
// /
924
925
// / Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
@@ -932,17 +933,18 @@ class __SYCL_EXPORT handler {
932
933
// size may not be a power of two. Those two cases are considered inefficient
933
934
// as they require additional code and checks in the kernel.
934
935
bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
935
- size_t InefficientCase = ( IsUnderLoaded || (WGSize & (WGSize - 1 ))) ? 1 : 0 ;
936
+ bool IsEfficientCase = ! IsUnderLoaded && (WGSize & (WGSize - 1 )) == 0 ;
936
937
937
938
bool IsUpdateOfUserAcc =
938
939
Reduction::accessor_mode == access::mode::read_write &&
939
940
NWorkGroups == 1 ;
940
941
941
942
// Use local memory to reduce elements in work-groups into 0-th element.
942
943
// If work-groups are not fully loaded or WGSize is not a power of two,
// then WGSize+1 elements are allocated.
943
- // The additional last element is used to catch reduce elements that
944
- // could otherwise be lost in the tree-reduction algorithm.
945
- auto LocalReds = Redu.getReadWriteLocalAcc (WGSize + InefficientCase, *this );
944
+ // The additional last element is used to catch elements that could
945
+ // otherwise be lost in the tree-reduction algorithm.
946
+ size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1 );
947
+ auto LocalReds = Redu.getReadWriteLocalAcc (NumLocalElements, *this );
946
948
947
949
// Get read accessor to the buffer that was used as output
948
950
// in the previous kernel. After that create new output buffer if needed
@@ -951,7 +953,7 @@ class __SYCL_EXPORT handler {
951
953
auto In = Redu.getReadAccToPreviousPartialReds (*this );
952
954
auto Out = Redu.getWriteAccForPartialReds (NWorkGroups, KernelRun, *this );
953
955
954
- if (!InefficientCase ) {
956
+ if (IsEfficientCase ) {
955
957
// Efficient case: work-groups are fully loaded and work-group size
956
958
// is power of two.
957
959
using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t <
@@ -972,7 +974,7 @@ class __SYCL_EXPORT handler {
972
974
NDIt.barrier ();
973
975
}
974
976
975
- // Compute the the partial sum/reduction for the work-group.
977
+ // Compute the partial sum/reduction for the work-group.
976
978
if (LID == 0 )
977
979
Out.get_pointer ().get ()[NDIt.get_group_linear_id ()] =
978
980
IsUpdateOfUserAcc ? BOp (*(Out.get_pointer ()), LocalReds[0 ])
@@ -1010,7 +1012,7 @@ class __SYCL_EXPORT handler {
1010
1012
PrevStep = CurStep;
1011
1013
}
1012
1014
1013
- // Compute the the partial sum/reduction for the work-group.
1015
+ // Compute the partial sum/reduction for the work-group.
1014
1016
if (LID == 0 ) {
1015
1017
auto GrID = NDIt.get_group_linear_id ();
1016
1018
auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
@@ -1096,7 +1098,7 @@ class __SYCL_EXPORT handler {
1096
1098
handler AuxHandler (QueueCopy, MIsHost);
1097
1099
AuxHandler.saveCodeLoc (MCodeLoc);
1098
1100
1099
- // The last kernel DOES write to reductions 's accessor.
1101
+ // The last kernel DOES write to reduction 's accessor.
1100
1102
// Associate it with handler manually.
1101
1103
if (NWorkGroups == 1 )
1102
1104
AuxHandler.associateWithHandler (Redu.MAcc );
0 commit comments