Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 068869e

Browse files
authored
[SYCL][Fusion] Test kernel fusion and optimization (#1535)
Test different scenarios for kernel fusion, including creation of the fused kernel by the JIT compiler and performance optimizations such as dataflow internalization. Automatically detect availability of the kernel fusion extension in the DPC++ build in `lit.cfg.py` and make it available for `REQUIRES` clauses. Spec: intel/llvm#7098 Implementation: intel/llvm#7831 Signed-off-by: Lukas Sommer <[email protected]>
1 parent 972156c commit 068869e

29 files changed

+2140
-8
lines changed

SYCL/KernelFusion/abort_fusion.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test fusion being aborted: Different scenarios causing the JIT compiler
// to abort fusion due to constraint violations for fusion. Also check that
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.

#include <sycl/sycl.hpp>

using namespace sycl;

// Number of elements in each test buffer.
constexpr size_t dataSize = 512;

// NOTE(review): unused in this test — presumably carried over from the
// sibling internalization test; confirm before removing.
enum class Internalization { None, Local, Private };
template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
22+
void performFusion(queue &q, range<Kernel1Dim> k1Global,
23+
range<Kernel1Dim> k1Local) {
24+
int in[dataSize], tmp[dataSize], out[dataSize];
25+
26+
for (size_t i = 0; i < dataSize; ++i) {
27+
in[i] = i;
28+
tmp[i] = -1;
29+
out[i] = -1;
30+
}
31+
{
32+
buffer<int> bIn{in, range{dataSize}};
33+
buffer<int> bTmp{tmp, range{dataSize}};
34+
buffer<int> bOut{out, range{dataSize}};
35+
36+
ext::codeplay::experimental::fusion_wrapper fw(q);
37+
fw.start_fusion();
38+
39+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
40+
41+
q.submit([&](handler &cgh) {
42+
auto accIn = bIn.get_access(cgh);
43+
auto accTmp = bTmp.get_access(cgh);
44+
cgh.parallel_for<Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
45+
[=](item<Kernel1Dim> i) {
46+
auto LID = i.get_linear_id();
47+
accTmp[LID] = accIn[LID] + 5;
48+
});
49+
});
50+
51+
q.submit([&](handler &cgh) {
52+
auto accTmp = bTmp.get_access(cgh);
53+
auto accOut = bOut.get_access(cgh);
54+
cgh.parallel_for<Kernel2Name>(nd_range<1>{{dataSize}, {8}}, [=](id<1> i) {
55+
accOut[i] = accTmp[i] * 2;
56+
});
57+
});
58+
59+
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
60+
61+
assert(!fw.is_in_fusion_mode() &&
62+
"Queue should not be in fusion mode anymore");
63+
}
64+
65+
// Check the results
66+
size_t numErrors = 0;
67+
for (size_t i = 0; i < k1Global.size(); ++i) {
68+
if (out[i] != ((i + 5) * 2)) {
69+
++numErrors;
70+
}
71+
}
72+
if (numErrors) {
73+
std::cout << "COMPUTATION ERROR\n";
74+
} else {
75+
std::cout << "COMPUTATION OK\n";
76+
}
77+
}
78+
79+
int main() {

  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: Fusing two kernels with different dimensionality should lead to
  // fusion being aborted.
  performFusion<class Kernel1_1, class Kernel2_1>(q, range<2>{32, 16},
                                                  range<2>{1, 8});
  // CHECK: WARNING: Cannot fuse kernels with different dimensionality
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different global size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_2, class Kernel2_2>(q, range<1>{256},
                                                  range<1>{8});
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different global size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different local size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
                                                  range<1>{16});
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test incomplete internalization: Different scenarios causing the JIT compiler
// to abort internalization due to target or parameter mismatch. Also check that
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.

#include <sycl/sycl.hpp>

using namespace sycl;

// Number of elements in each test buffer.
constexpr size_t dataSize = 512;

// Which dataflow-internalization target to request for an accessor.
enum class Internalization { None, Local, Private };
// Submit two kernels inside a fusion region, requesting the given
// internalization target (None/Local/Private) for the intermediate buffer
// in each kernel. A localSize of 0 means "no work-group size specified"
// (plain range launch instead of nd_range). After completion, verify the
// computation and — unless expectInternalization is set — that the
// intermediate buffer was NOT internalized (i.e. tmp was written back).
// Prints one of "COMPUTATION OK" / "COMPUTATION ERROR" /
// "WRONG INTERNALIZATION" for the FileCheck patterns in main().
void performFusion(queue &q, Internalization intKernel1,
                   size_t localSizeKernel1, Internalization intKernel2,
                   size_t localSizeKernel2,
                   bool expectInternalization = false) {
  int in[dataSize], tmp[dataSize], out[dataSize];
  for (size_t i = 0; i < dataSize; ++i) {
    in[i] = i;
    tmp[i] = -1;
    out[i] = -1;
  }
  // Scope ensures write-back of the buffers before the host-side checks.
  {
    buffer<int> bIn{in, range{dataSize}};
    buffer<int> bTmp{tmp, range{dataSize}};
    buffer<int> bOut{out, range{dataSize}};

    ext::codeplay::experimental::fusion_wrapper fw{q};
    fw.start_fusion();

    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

    // Kernel 1: tmp[i] = in[i] + 5, with the requested promotion property.
    q.submit([&](handler &cgh) {
      auto accIn = bIn.get_access(cgh);
      property_list properties{};
      if (intKernel1 == Internalization::Private) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_private{}};
      } else if (intKernel1 == Internalization::Local) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_local{}};
      }
      accessor<int> accTmp = bTmp.get_access(cgh, properties);

      if (localSizeKernel1 > 0) {
        cgh.parallel_for<class Kernel1>(
            nd_range<1>{{dataSize}, {localSizeKernel1}},
            [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
      } else {
        // No work-group size specified.
        cgh.parallel_for<class KernelOne>(
            dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
      }
    });

    // Kernel 2: out[i] = tmp[i] * 2, with the requested promotion property.
    q.submit([&](handler &cgh) {
      property_list properties{};
      if (intKernel2 == Internalization::Private) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_private{}};
      } else if (intKernel2 == Internalization::Local) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_local{}};
      }
      accessor<int> accTmp = bTmp.get_access(cgh, properties);
      auto accOut = bOut.get_access(cgh);
      if (localSizeKernel2 > 0) {
        cgh.parallel_for<class Kernel2>(
            nd_range<1>{{dataSize}, {localSizeKernel2}},
            [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
      } else {
        // No work-group size specified.
        cgh.parallel_for<class KernelTwo>(
            dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
      }
    });

    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

    assert(!fw.is_in_fusion_mode() &&
           "Queue should not be in fusion mode anymore");
  }

  // Check the results. tmp[i] still being -1 means the intermediate buffer
  // was internalized and never written back.
  size_t numErrors = 0;
  size_t numInternalized = 0;
  for (size_t i = 0; i < dataSize; ++i) {
    if (out[i] != ((i + 5) * 2)) {
      ++numErrors;
    }
    if (tmp[i] == -1) {
      ++numInternalized;
    }
  }
  if (numErrors) {
    std::cout << "COMPUTATION ERROR\n";
    return;
  }
  if (!expectInternalization && numInternalized) {
    std::cout << "WRONG INTERNALIZATION\n";
    return;
  }
  std::cout << "COMPUTATION OK\n";
}
int main() {
  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: One accessor without internalization, one with local
  // internalization. Should fall back to no internalization and print a
  // warning.
  std::cout << "None, Local(0)\n";
  performFusion(q, Internalization::None, 0, Internalization::Local, 0);
  // CHECK: None, Local(0)
  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: One accessor without internalization, one with private
  // internalization. Should fall back to no internalization and print a
  // warning.
  std::cout << "None, Private\n";
  performFusion(q, Internalization::None, 0, Internalization::Private, 0);
  // CHECK-NEXT: None, Private
  // CHECK-NEXT: WARNING: Not performing specified private promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the second kernel does
  // not specify a work-group size. No promotion should happen and a warning
  // should be printed.
  std::cout << "Local(8), Local(0)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Local, 0);
  // CHECK-NEXT: Local(8), Local(0)
  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the first kernel does
  // not specify a work-group size. No promotion should happen and a warning
  // should be printed.
  std::cout << "Local(0), Local(8)\n";
  performFusion(q, Internalization::Local, 0, Internalization::Local, 8);
  // CHECK-NEXT: Local(0), Local(8)
  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the kernels specify
  // different work-group sizes. No promotion should happen and a warning should
  // be printed.
  std::cout << "Local(8), Local(16)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
  // CHECK-NEXT: Local(8), Local(16)
  // CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: One accessor with local internalization, one with private
  // internalization. Should fall back to local internalization and print a
  // warning.
  std::cout << "Local(8), Private(8)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Private, 8,
                /* expectInternalization */ true);
  // CHECK-NEXT: Local(8), Private(8)
  // CHECK-NEXT: WARNING: Performing local internalization instead, because previous accessor specified local promotion
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test complete fusion with local internalization and a combination of kernels
// that require a work-group barrier to be inserted by fusion.

#include <sycl/sycl.hpp>

using namespace sycl;
int main() {
15+
constexpr size_t dataSize = 512;
16+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
17+
18+
for (size_t i = 0; i < dataSize; ++i) {
19+
in1[i] = i * 2;
20+
in2[i] = i * 3;
21+
in3[i] = i * 4;
22+
tmp[i] = -1;
23+
out[i] = -1;
24+
}
25+
26+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
27+
28+
{
29+
buffer<int> bIn1{in1, range{dataSize}};
30+
buffer<int> bIn2{in2, range{dataSize}};
31+
buffer<int> bIn3{in3, range{dataSize}};
32+
buffer<int> bTmp{
33+
tmp,
34+
range{dataSize},
35+
{sycl::ext::codeplay::experimental::property::promote_local{}}};
36+
buffer<int> bOut{out, range{dataSize}};
37+
38+
ext::codeplay::experimental::fusion_wrapper fw{q};
39+
fw.start_fusion();
40+
41+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
42+
43+
q.submit([&](handler &cgh) {
44+
auto accIn1 = bIn1.get_access(cgh);
45+
auto accIn2 = bIn2.get_access(cgh);
46+
auto accTmp = bTmp.get_access(cgh);
47+
cgh.parallel_for<class KernelOne>(
48+
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
49+
auto workgroupSize = i.get_local_range(0);
50+
auto baseOffset = i.get_group_linear_id() * workgroupSize;
51+
auto localIndex = i.get_local_linear_id();
52+
auto localOffset = (workgroupSize - 1) - localIndex;
53+
accTmp[baseOffset + localOffset] =
54+
accIn1[baseOffset + localOffset] +
55+
accIn2[baseOffset + localOffset];
56+
});
57+
});
58+
59+
q.submit([&](handler &cgh) {
60+
auto accTmp = bTmp.get_access(cgh);
61+
auto accIn3 = bIn3.get_access(cgh);
62+
auto accOut = bOut.get_access(cgh);
63+
cgh.parallel_for<class KernelTwo>(
64+
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
65+
auto index = i.get_global_linear_id();
66+
accOut[index] = accTmp[index] * accIn3[index];
67+
});
68+
});
69+
70+
fw.complete_fusion();
71+
72+
assert(!fw.is_in_fusion_mode() &&
73+
"Queue should not be in fusion mode anymore");
74+
}
75+
76+
// Check the results
77+
for (size_t i = 0; i < dataSize; ++i) {
78+
assert(out[i] == (20 * i * i) && "Computation error");
79+
assert(tmp[i] == -1 && "Not internalized");
80+
}
81+
82+
return 0;
83+
}

0 commit comments

Comments
 (0)