Skip to content

Commit a409752

Browse files
committed
[SYCL] Do additional changes per reviewer's comments, fix regressed LIT tests
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 3b4af65 commit a409752

File tree

9 files changed

+103
-265
lines changed

9 files changed

+103
-265
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ class __SYCL_EXPORT handler {
232232
void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
233233

234234
/// Stores the given \param Event to the \param Queue.
235-
/// Even thought MQueue is a field of handler, the method addEvent() of
235+
/// Even though MQueue is a field of handler, the method addEvent() of
236236
/// queue_impl class cannot be called inside this handler.hpp file
237237
/// as queue_impl is incomplete class for handler.
238238
static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
@@ -814,7 +814,7 @@ class __SYCL_EXPORT handler {
814814
/// user's lambda function \param KernelFunc and does one iteration of
815815
/// reduction of elements in each of work-groups.
816816
/// This version uses tree-reduction algorithm to reduce elements in each
817-
/// of work-groups. At the end of each work-groups the partial sum is written
817+
/// of work-groups. At the end of each work-group the partial sum is written
818818
/// to a global buffer.
819819
///
820820
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
@@ -827,21 +827,22 @@ class __SYCL_EXPORT handler {
827827
size_t NWorkGroups = Range.get_group_range().size();
828828

829829
bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
830-
size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0;
830+
bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
831831

832832
bool IsUpdateOfUserAcc =
833833
Reduction::accessor_mode == access::mode::read_write &&
834834
NWorkGroups == 1;
835835

836836
// Use local memory to reduce elements in work-groups into 0-th element.
837837
// If WGSize is not power of two, then WGSize+1 elements are allocated.
838-
// The additional last element is used to catch reduce elements that could
839-
// otherwise be lost in the tree-reduction algorithm used in the kernel.
840-
auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this);
838+
// The additional last element is used to catch elements that could
839+
// otherwise be lost in the tree-reduction algorithm.
840+
size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
841+
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
841842

842843
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
843844
auto ReduIdentity = Redu.getIdentity();
844-
if (!InefficientCase) {
845+
if (IsEfficientCase) {
845846
// Efficient case: work-groups are fully loaded and work-group size
846847
// is power of two.
847848
parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
@@ -863,7 +864,7 @@ class __SYCL_EXPORT handler {
863864
NDIt.barrier();
864865
}
865866

866-
// Compute the the partial sum/reduction for the work-group.
867+
// Compute the partial sum/reduction for the work-group.
867868
if (LID == 0)
868869
Out.get_pointer().get()[NDIt.get_group_linear_id()] =
869870
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
@@ -904,7 +905,7 @@ class __SYCL_EXPORT handler {
904905
PrevStep = CurStep;
905906
}
906907

907-
// Compute the the partial sum/reduction for the work-group.
908+
// Compute the partial sum/reduction for the work-group.
908909
if (LID == 0) {
909910
auto GrID = NDIt.get_group_linear_id();
910911
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
@@ -918,7 +919,7 @@ class __SYCL_EXPORT handler {
918919
/// Implements a command group function that enqueues a kernel that does one
919920
/// iteration of reduction of elements in each of work-groups.
920921
/// This version uses tree-reduction algorithm to reduce elements in each
921-
/// of work-groups. At the end of each work-groups the partial sum is written
922+
/// of work-groups. At the end of each work-group the partial sum is written
922923
/// to a global buffer.
923924
///
924925
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
@@ -932,17 +933,18 @@ class __SYCL_EXPORT handler {
932933
// size may be not power of those. Those two cases considered inefficient
933934
// as they require additional code and checks in the kernel.
934935
bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
935-
size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0;
936+
bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
936937

937938
bool IsUpdateOfUserAcc =
938939
Reduction::accessor_mode == access::mode::read_write &&
939940
NWorkGroups == 1;
940941

941942
// Use local memory to reduce elements in work-groups into 0-th element.
942943
// If WGSize is not power of two, then WGSize+1 elements are allocated.
943-
// The additional last element is used to catch reduce elements that
944-
// could otherwise be lost in the tree-reduction algorithm.
945-
auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this);
944+
// The additional last element is used to catch elements that could
945+
// otherwise be lost in the tree-reduction algorithm.
946+
size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
947+
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
946948

947949
// Get read accessor to the buffer that was used as output
948950
// in the previous kernel. After that create new output buffer if needed
@@ -951,7 +953,7 @@ class __SYCL_EXPORT handler {
951953
auto In = Redu.getReadAccToPreviousPartialReds(*this);
952954
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
953955

954-
if (!InefficientCase) {
956+
if (IsEfficientCase) {
955957
// Efficient case: work-groups are fully loaded and work-group size
956958
// is power of two.
957959
using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
@@ -972,7 +974,7 @@ class __SYCL_EXPORT handler {
972974
NDIt.barrier();
973975
}
974976

975-
// Compute the the partial sum/reduction for the work-group.
977+
// Compute the partial sum/reduction for the work-group.
976978
if (LID == 0)
977979
Out.get_pointer().get()[NDIt.get_group_linear_id()] =
978980
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
@@ -1010,7 +1012,7 @@ class __SYCL_EXPORT handler {
10101012
PrevStep = CurStep;
10111013
}
10121014

1013-
// Compute the the partial sum/reduction for the work-group.
1015+
// Compute the partial sum/reduction for the work-group.
10141016
if (LID == 0) {
10151017
auto GrID = NDIt.get_group_linear_id();
10161018
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
@@ -1096,7 +1098,7 @@ class __SYCL_EXPORT handler {
10961098
handler AuxHandler(QueueCopy, MIsHost);
10971099
AuxHandler.saveCodeLoc(MCodeLoc);
10981100

1099-
// The last kernel DOES write to reductions's accessor.
1101+
// The last kernel DOES write to reduction's accessor.
11001102
// Associate it with handler manually.
11011103
if (NWorkGroups == 1)
11021104
AuxHandler.associateWithHandler(Redu.MAcc);

sycl/source/handler.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
2525
}
2626

2727
event handler::finalize() {
28-
// This block of code is needed only to 5th/default reduction implementation.
29-
// It is harmless (does nothing) for other implementations.
28+
// This block of code is needed only for reduction implementation.
29+
// It is harmless (does nothing) for everything else.
3030
if (MIsFinalized)
3131
return MLastEvent;
3232
MIsFinalized = true;

sycl/test/abi/symbol_size.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ int main() {
4343
check_size<device_selector, 8>();
4444
check_size<event, 16>();
4545
check_size<gpu_selector, 8>();
46+
#ifdef _MSC_VER
47+
check_size<handler, 520>();
48+
#else
4649
check_size<handler, 528>();
50+
#endif
4751
check_size<image<1>, 16>();
4852
check_size<kernel, 16>();
4953
check_size<platform, 16>();

sycl/test/reduction/reduction_nd_conditional.cpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,9 @@
1-
// RUN: %clangxx -fsycl %s -o %t.out
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
22
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
33
// RUN: %CPU_RUN_PLACEHOLDER %t.out
44
// RUN: %GPU_RUN_PLACEHOLDER %t.out
55
// RUN: %ACC_RUN_PLACEHOLDER %t.out
66

7-
//==---reduction_nd_conditional.cpp - SYCL reduction + condition test ------==//
8-
//
9-
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
10-
// See https://llvm.org/LICENSE.txt for license information.
11-
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
12-
//
13-
//===----------------------------------------------------------------------===//
14-
157
// This test performs basic checks of parallel_for(nd_range, reduction, func)
168
// with reduction and conditional increment of the reduction variable.
179

@@ -41,9 +33,7 @@ void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
4133
};
4234

4335
template <typename T, int Dim, class BinaryOperation>
44-
class Known;
45-
template <typename T, int Dim, class BinaryOperation>
46-
class Unknown;
36+
class SomeClass;
4737

4838
template <typename T>
4939
struct Vec {
@@ -97,7 +87,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
9787
range<1> GlobalRange(NWItems);
9888
range<1> LocalRange(WGSize);
9989
nd_range<1> NDRange(GlobalRange, LocalRange);
100-
CGH.parallel_for<Known<T, Dim, BinaryOperation>>(
90+
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
10191
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
10292
size_t I = NDIt.get_global_linear_id();
10393
if (I < 2)

sycl/test/reduction/reduction_nd_s0_dw.cpp

Lines changed: 5 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,20 @@
1-
// RUN: %clangxx -fsycl %s -o %t.out
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
22
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
33
// RUN: %CPU_RUN_PLACEHOLDER %t.out
44
// RUN: %GPU_RUN_PLACEHOLDER %t.out
55
// RUN: %ACC_RUN_PLACEHOLDER %t.out
6-
//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
7-
//
8-
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9-
// See https://llvm.org/LICENSE.txt for license information.
10-
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11-
//
12-
//===----------------------------------------------------------------------===//
136

147
// This test performs basic checks of parallel_for(nd_range, reduction, func)
158
// with reductions initialized with 0-dimensional discard_write accessor.
169

10+
#include "reduction_utils.hpp"
1711
#include <CL/sycl.hpp>
1812
#include <cassert>
1913

2014
using namespace cl::sycl;
2115

22-
template <typename T, class BinaryOperation>
23-
void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
24-
BinaryOperation BOp, size_t N) {
25-
ExpectedOut = Identity;
26-
auto In = InBuf.template get_access<access::mode::write>();
27-
for (int I = 0; I < N; ++I) {
28-
if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
29-
In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
30-
else
31-
In[I] = ((I + 1) % 5) + 1.1;
32-
ExpectedOut = BOp(ExpectedOut, In[I]);
33-
}
34-
};
35-
36-
template <typename T, int Dim, class BinaryOperation>
37-
class Known;
3816
template <typename T, int Dim, class BinaryOperation>
39-
class Unknown;
40-
41-
template <typename T>
42-
struct Vec {
43-
Vec() : X(0), Y(0) {}
44-
Vec(T X, T Y) : X(X), Y(Y) {}
45-
Vec(T V) : X(V), Y(V) {}
46-
bool operator==(const Vec &P) const {
47-
return P.X == X && P.Y == Y;
48-
}
49-
bool operator!=(const Vec &P) const {
50-
return !(*this == P);
51-
}
52-
T X;
53-
T Y;
54-
};
55-
template <typename T>
56-
bool operator==(const Vec<T> &A, const Vec<T> &B) {
57-
return A.X == B.X && A.Y == B.Y;
58-
}
59-
template <typename T>
60-
std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
61-
return OS << "(" << P.X << ", " << P.Y << ")";
62-
}
63-
64-
template <class T>
65-
struct VecPlus {
66-
using P = Vec<T>;
67-
P operator()(const P &A, const P &B) const {
68-
return P(A.X + B.X, A.Y + B.Y);
69-
}
70-
};
17+
class SomeClass;
7118

7219
template <typename T, int Dim, class BinaryOperation>
7320
void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -90,7 +37,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
9037
range<1> GlobalRange(NWItems);
9138
range<1> LocalRange(WGSize);
9239
nd_range<1> NDRange(GlobalRange, LocalRange);
93-
CGH.parallel_for<Known<T, Dim, BinaryOperation>>(
40+
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
9441
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
9542
Sum.combine(In[NDIt.get_global_linear_id()]);
9643
});
@@ -142,7 +89,7 @@ int main() {
14289
test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
14390

14491
// Check with CUSTOM type.
145-
test<Vec<long long>, 0, VecPlus<long long>>(Vec<long long>(0), 8, 256);
92+
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
14693

14794
std::cout << "Test passed\n";
14895
return 0;

sycl/test/reduction/reduction_nd_s0_rw.cpp

Lines changed: 5 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,20 @@
1-
// RUN: %clangxx -fsycl %s -o %t.out
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
22
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
33
// RUN: %CPU_RUN_PLACEHOLDER %t.out
44
// RUN: %GPU_RUN_PLACEHOLDER %t.out
55
// RUN: %ACC_RUN_PLACEHOLDER %t.out
6-
//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
7-
//
8-
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9-
// See https://llvm.org/LICENSE.txt for license information.
10-
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11-
//
12-
//===----------------------------------------------------------------------===//
136

147
// This test performs basic checks of parallel_for(nd_range, reduction, func)
158
// with reductions initialized with 0-dimensional read_write accessor.
169

10+
#include "reduction_utils.hpp"
1711
#include <CL/sycl.hpp>
1812
#include <cassert>
1913

2014
using namespace cl::sycl;
2115

22-
template <typename T, class BinaryOperation>
23-
void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
24-
BinaryOperation BOp, size_t N) {
25-
ExpectedOut = Identity;
26-
auto In = InBuf.template get_access<access::mode::write>();
27-
for (int I = 0; I < N; ++I) {
28-
if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
29-
In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
30-
else
31-
In[I] = ((I + 1) % 5) + 1.1;
32-
ExpectedOut = BOp(ExpectedOut, In[I]);
33-
}
34-
};
35-
36-
template <typename T, int Dim, class BinaryOperation>
37-
class Known;
3816
template <typename T, int Dim, class BinaryOperation>
39-
class Unknown;
40-
41-
template <typename T>
42-
struct Vec {
43-
Vec() : X(0), Y(0) {}
44-
Vec(T X, T Y) : X(X), Y(Y) {}
45-
Vec(T V) : X(V), Y(V) {}
46-
bool operator==(const Vec &P) const {
47-
return P.X == X && P.Y == Y;
48-
}
49-
bool operator!=(const Vec &P) const {
50-
return !(*this == P);
51-
}
52-
T X;
53-
T Y;
54-
};
55-
template <typename T>
56-
bool operator==(const Vec<T> &A, const Vec<T> &B) {
57-
return A.X == B.X && A.Y == B.Y;
58-
}
59-
template <typename T>
60-
std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
61-
return OS << "(" << P.X << ", " << P.Y << ")";
62-
}
63-
64-
template <class T>
65-
struct VecPlus {
66-
using P = Vec<T>;
67-
P operator()(const P &A, const P &B) const {
68-
return P(A.X + B.X, A.Y + B.Y);
69-
}
70-
};
17+
class SomeClass;
7118

7219
template <typename T, int Dim, class BinaryOperation>
7320
void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -92,7 +39,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
9239
range<1> GlobalRange(NWItems);
9340
range<1> LocalRange(WGSize);
9441
nd_range<1> NDRange(GlobalRange, LocalRange);
95-
CGH.parallel_for<Known<T, Dim, BinaryOperation>>(
42+
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
9643
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
9744
Sum.combine(In[NDIt.get_global_linear_id()]);
9845
});
@@ -144,7 +91,7 @@ int main() {
14491
test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
14592

14693
// Check with CUSTOM type.
147-
test<Vec<long long>, 0, VecPlus<long long>>(Vec<long long>(0), 8, 256);
94+
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
14895

14996
std::cout << "Test passed\n";
15097
return 0;

0 commit comments

Comments
 (0)