[SYCL] Do additional changes per reviewer's comments, fix regressed LIT tests

v-klochkov · v-klochkov · commit a4097523ce31 · 2020-04-27T13:11:33.000-07:00
Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
@@ -232,7 +232,7 @@ class __SYCL_EXPORT handler {
   void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
 
   /// Stores the given \param Event to the \param Queue.
-  /// Even thought MQueue is a field of handler, the method addEvent() of
+  /// Even though MQueue is a field of handler, the method addEvent() of
   /// queue_impl class cannot be called inside this handler.hpp file
   /// as queue_impl is incomplete class for handler.
   static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
@@ -814,7 +814,7 @@ class __SYCL_EXPORT handler {
   /// user's lambda function \param KernelFunc and does one iteration of
   /// reduction of elements in each of work-groups.
   /// This version uses tree-reduction algorithm to reduce elements in each
-  /// of work-groups. At the end of each work-groups the partial sum is written
+  /// of work-groups. At the end of each work-group the partial sum is written
   /// to a global buffer.
   ///
   /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
@@ -827,21 +827,22 @@ class __SYCL_EXPORT handler {
     size_t NWorkGroups = Range.get_group_range().size();
 
     bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
-    size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0;
+    bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
 
     bool IsUpdateOfUserAcc =
         Reduction::accessor_mode == access::mode::read_write &&
         NWorkGroups == 1;
 
     // Use local memory to reduce elements in work-groups into 0-th element.
     // If WGSize is not power of two, then WGSize+1 elements are allocated.
-    // The additional last element is used to catch reduce elements that could
-    // otherwise be lost in the tree-reduction algorithm used in the kernel.
-    auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this);
+    // The additional last element is used to catch elements that could
+    // otherwise be lost in the tree-reduction algorithm.
+    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
 
     auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
     auto ReduIdentity = Redu.getIdentity();
-    if (!InefficientCase) {
+    if (IsEfficientCase) {
       // Efficient case: work-groups are fully loaded and work-group size
       // is power of two.
       parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
@@ -863,7 +864,7 @@ class __SYCL_EXPORT handler {
           NDIt.barrier();
         }
 
-        // Compute the the partial sum/reduction for the work-group.
+        // Compute the partial sum/reduction for the work-group.
         if (LID == 0)
           Out.get_pointer().get()[NDIt.get_group_linear_id()] =
               IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
@@ -904,7 +905,7 @@ class __SYCL_EXPORT handler {
           PrevStep = CurStep;
         }
 
-        // Compute the the partial sum/reduction for the work-group.
+        // Compute the partial sum/reduction for the work-group.
         if (LID == 0) {
           auto GrID = NDIt.get_group_linear_id();
           auto V = BOp(LocalReds[0], LocalReds[WGSize]);
@@ -918,7 +919,7 @@ class __SYCL_EXPORT handler {
   /// Implements a command group function that enqueues a kernel that does one
   /// iteration of reduction of elements in each of work-groups.
   /// This version uses tree-reduction algorithm to reduce elements in each
-  /// of work-groups. At the end of each work-groups the partial sum is written
+  /// of work-groups. At the end of each work-group the partial sum is written
   /// to a global buffer.
   ///
   /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
@@ -932,17 +933,18 @@ class __SYCL_EXPORT handler {
     // size may be not power of those. Those two cases considered inefficient
     // as they require additional code and checks in the kernel.
     bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
-    size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0;
+    bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
 
     bool IsUpdateOfUserAcc =
         Reduction::accessor_mode == access::mode::read_write &&
         NWorkGroups == 1;
 
     // Use local memory to reduce elements in work-groups into 0-th element.
     // If WGSize is not power of two, then WGSize+1 elements are allocated.
-    // The additional last element is used to catch reduce elements that
-    // could otherwise be lost in the tree-reduction algorithm.
-    auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this);
+    // The additional last element is used to catch elements that could
+    // otherwise be lost in the tree-reduction algorithm.
+    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
 
     // Get read accessor to the buffer that was used as output
     // in the previous kernel. After that create new output buffer if needed
@@ -951,7 +953,7 @@ class __SYCL_EXPORT handler {
     auto In = Redu.getReadAccToPreviousPartialReds(*this);
     auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
 
-    if (!InefficientCase) {
+    if (IsEfficientCase) {
       // Efficient case: work-groups are fully loaded and work-group size
       // is power of two.
       using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
@@ -972,7 +974,7 @@ class __SYCL_EXPORT handler {
           NDIt.barrier();
         }
 
-        // Compute the the partial sum/reduction for the work-group.
+        // Compute the partial sum/reduction for the work-group.
         if (LID == 0)
           Out.get_pointer().get()[NDIt.get_group_linear_id()] =
               IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
@@ -1010,7 +1012,7 @@ class __SYCL_EXPORT handler {
           PrevStep = CurStep;
         }
 
-        // Compute the the partial sum/reduction for the work-group.
+        // Compute the partial sum/reduction for the work-group.
         if (LID == 0) {
           auto GrID = NDIt.get_group_linear_id();
           auto V = BOp(LocalReds[0], LocalReds[WGSize]);
@@ -1096,7 +1098,7 @@ class __SYCL_EXPORT handler {
       handler AuxHandler(QueueCopy, MIsHost);
       AuxHandler.saveCodeLoc(MCodeLoc);
 
-      // The last kernel DOES write to reductions's accessor.
+      // The last kernel DOES write to reduction's accessor.
       // Associate it with handler manually.
       if (NWorkGroups == 1)
         AuxHandler.associateWithHandler(Redu.MAcc);
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
@@ -25,8 +25,8 @@ void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
 }
 
 event handler::finalize() {
-  // This block of code is needed only to 5th/default reduction implementation.
-  // It is harmless (does nothing) for other implementations.
+  // This block of code is needed only for the reduction implementation.
+  // It is harmless (does nothing) for everything else.
   if (MIsFinalized)
     return MLastEvent;
   MIsFinalized = true;
diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp
@@ -43,7 +43,11 @@ int main() {
   check_size<device_selector, 8>();
   check_size<event, 16>();
   check_size<gpu_selector, 8>();
+#ifdef _MSC_VER
+  check_size<handler, 520>();
+#else
   check_size<handler, 528>();
+#endif
   check_size<image<1>, 16>();
   check_size<kernel, 16>();
   check_size<platform, 16>();
diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp
@@ -1,17 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-//==---reduction_nd_conditional.cpp - SYCL reduction + condition test ------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // with reduction and conditional increment of the reduction variable.
 
@@ -41,9 +33,7 @@ void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
 };
 
 template <typename T, int Dim, class BinaryOperation>
-class Known;
-template <typename T, int Dim, class BinaryOperation>
-class Unknown;
+class SomeClass;
 
 template <typename T>
 struct Vec {
@@ -97,7 +87,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
-    CGH.parallel_for<Known<T, Dim, BinaryOperation>>(
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
         NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
           size_t I = NDIt.get_global_linear_id();
           if (I < 2)
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -1,73 +1,20 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // with reductions initialized with 0-dimensional discard_write accessor.
 
+#include "reduction_utils.hpp"
 #include <CL/sycl.hpp>
 #include <cassert>
 
 using namespace cl::sycl;
 
-template <typename T, class BinaryOperation>
-void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
-                   BinaryOperation BOp, size_t N) {
-  ExpectedOut = Identity;
-  auto In = InBuf.template get_access<access::mode::write>();
-  for (int I = 0; I < N; ++I) {
-    if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
-      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
-    else
-      In[I] = ((I + 1) % 5) + 1.1;
-    ExpectedOut = BOp(ExpectedOut, In[I]);
-  }
-};
-
-template <typename T, int Dim, class BinaryOperation>
-class Known;
 template <typename T, int Dim, class BinaryOperation>
-class Unknown;
-
-template <typename T>
-struct Vec {
-  Vec() : X(0), Y(0) {}
-  Vec(T X, T Y) : X(X), Y(Y) {}
-  Vec(T V) : X(V), Y(V) {}
-  bool operator==(const Vec &P) const {
-    return P.X == X && P.Y == Y;
-  }
-  bool operator!=(const Vec &P) const {
-    return !(*this == P);
-  }
-  T X;
-  T Y;
-};
-template <typename T>
-bool operator==(const Vec<T> &A, const Vec<T> &B) {
-  return A.X == B.X && A.Y == B.Y;
-}
-template <typename T>
-std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
-  return OS << "(" << P.X << ", " << P.Y << ")";
-}
-
-template <class T>
-struct VecPlus {
-  using P = Vec<T>;
-  P operator()(const P &A, const P &B) const {
-    return P(A.X + B.X, A.Y + B.Y);
-  }
-};
+class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -90,7 +37,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
-    CGH.parallel_for<Known<T, Dim, BinaryOperation>>(
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
         NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
@@ -142,7 +89,7 @@ int main() {
   test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
 
   // Check with CUSTOM type.
-  test<Vec<long long>, 0, VecPlus<long long>>(Vec<long long>(0), 8, 256);
+  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -1,73 +1,20 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // with reductions initialized with 0-dimensional read_write accessor.
 
+#include "reduction_utils.hpp"
 #include <CL/sycl.hpp>
 #include <cassert>
 
 using namespace cl::sycl;
 
-template <typename T, class BinaryOperation>
-void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
-                   BinaryOperation BOp, size_t N) {
-  ExpectedOut = Identity;
-  auto In = InBuf.template get_access<access::mode::write>();
-  for (int I = 0; I < N; ++I) {
-    if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
-      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
-    else
-      In[I] = ((I + 1) % 5) + 1.1;
-    ExpectedOut = BOp(ExpectedOut, In[I]);
-  }
-};
-
-template <typename T, int Dim, class BinaryOperation>
-class Known;
 template <typename T, int Dim, class BinaryOperation>
-class Unknown;
-
-template <typename T>
-struct Vec {
-  Vec() : X(0), Y(0) {}
-  Vec(T X, T Y) : X(X), Y(Y) {}
-  Vec(T V) : X(V), Y(V) {}
-  bool operator==(const Vec &P) const {
-    return P.X == X && P.Y == Y;
-  }
-  bool operator!=(const Vec &P) const {
-    return !(*this == P);
-  }
-  T X;
-  T Y;
-};
-template <typename T>
-bool operator==(const Vec<T> &A, const Vec<T> &B) {
-  return A.X == B.X && A.Y == B.Y;
-}
-template <typename T>
-std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
-  return OS << "(" << P.X << ", " << P.Y << ")";
-}
-
-template <class T>
-struct VecPlus {
-  using P = Vec<T>;
-  P operator()(const P &A, const P &B) const {
-    return P(A.X + B.X, A.Y + B.Y);
-  }
-};
+class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
 void test(T Identity, size_t WGSize, size_t NWItems) {
@@ -92,7 +39,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
-    CGH.parallel_for<Known<T, Dim, BinaryOperation>>(
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
         NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
@@ -144,7 +91,7 @@ int main() {
   test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
 
   // Check with CUSTOM type.
-  test<Vec<long long>, 0, VecPlus<long long>>(Vec<long long>(0), 8, 256);
+  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp