Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 068869e

Browse files
authored
[SYCL][Fusion] Test kernel fusion and optimization (#1535)
Test different scenarios for kernel fusion, including creation of the fused kernel by the JIT compiler and performance optimizations such as dataflow internalization. Automatically detect availability of the kernel fusion extension in the DPC++ build in `lit.cfg.py` and make it available for `REQUIRES` clauses. Spec: intel/llvm#7098 Implementation: intel/llvm#7831 Signed-off-by: Lukas Sommer <[email protected]>
1 parent 972156c commit 068869e

29 files changed

+2140
-8
lines changed

SYCL/KernelFusion/abort_fusion.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test fusion being aborted: Different scenarios causing the JIT compiler
// to abort fusion due to constraint violations for fusion. Also check that
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.

#include <sycl/sycl.hpp>

using namespace sycl;

// Number of elements in each test buffer.
constexpr size_t dataSize = 512;

// NOTE(review): unused in this test — presumably carried over from the
// sibling internalization test; confirm before removing.
enum class Internalization { None, Local, Private };
template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
22+
void performFusion(queue &q, range<Kernel1Dim> k1Global,
23+
range<Kernel1Dim> k1Local) {
24+
int in[dataSize], tmp[dataSize], out[dataSize];
25+
26+
for (size_t i = 0; i < dataSize; ++i) {
27+
in[i] = i;
28+
tmp[i] = -1;
29+
out[i] = -1;
30+
}
31+
{
32+
buffer<int> bIn{in, range{dataSize}};
33+
buffer<int> bTmp{tmp, range{dataSize}};
34+
buffer<int> bOut{out, range{dataSize}};
35+
36+
ext::codeplay::experimental::fusion_wrapper fw(q);
37+
fw.start_fusion();
38+
39+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
40+
41+
q.submit([&](handler &cgh) {
42+
auto accIn = bIn.get_access(cgh);
43+
auto accTmp = bTmp.get_access(cgh);
44+
cgh.parallel_for<Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
45+
[=](item<Kernel1Dim> i) {
46+
auto LID = i.get_linear_id();
47+
accTmp[LID] = accIn[LID] + 5;
48+
});
49+
});
50+
51+
q.submit([&](handler &cgh) {
52+
auto accTmp = bTmp.get_access(cgh);
53+
auto accOut = bOut.get_access(cgh);
54+
cgh.parallel_for<Kernel2Name>(nd_range<1>{{dataSize}, {8}}, [=](id<1> i) {
55+
accOut[i] = accTmp[i] * 2;
56+
});
57+
});
58+
59+
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
60+
61+
assert(!fw.is_in_fusion_mode() &&
62+
"Queue should not be in fusion mode anymore");
63+
}
64+
65+
// Check the results
66+
size_t numErrors = 0;
67+
for (size_t i = 0; i < k1Global.size(); ++i) {
68+
if (out[i] != ((i + 5) * 2)) {
69+
++numErrors;
70+
}
71+
}
72+
if (numErrors) {
73+
std::cout << "COMPUTATION ERROR\n";
74+
} else {
75+
std::cout << "COMPUTATION OK\n";
76+
}
77+
}
78+
79+
int main() {

  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: Fusing two kernels with different dimensionality should lead to
  // fusion being aborted.
  performFusion<class Kernel1_1, class Kernel2_1>(q, range<2>{32, 16},
                                                  range<2>{1, 8});
  // CHECK: WARNING: Cannot fuse kernels with different dimensionality
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different global size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_2, class Kernel2_2>(q, range<1>{256},
                                                  range<1>{8});
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different global size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different local size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
                                                  range<1>{16});
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test incomplete internalization: Different scenarios causing the JIT compiler
// to abort internalization due to target or parameter mismatch. Also check that
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.

#include <sycl/sycl.hpp>

using namespace sycl;

// Number of elements in each test buffer.
constexpr size_t dataSize = 512;

// Which dataflow-internalization target to request for an accessor.
enum class Internalization { None, Local, Private };
// Submit two kernels inside a fusion region, requesting the given
// internalization target (None/Local/Private) for the intermediate buffer
// in each kernel. A localSize of 0 means "no work-group size specified"
// (plain range launch instead of nd_range). After completion, verify the
// computation and — unless expectInternalization is set — that the
// intermediate buffer was NOT internalized (i.e. tmp was written back).
// Prints one of "COMPUTATION OK" / "COMPUTATION ERROR" /
// "WRONG INTERNALIZATION" for the FileCheck patterns in main().
void performFusion(queue &q, Internalization intKernel1,
                   size_t localSizeKernel1, Internalization intKernel2,
                   size_t localSizeKernel2,
                   bool expectInternalization = false) {
  int in[dataSize], tmp[dataSize], out[dataSize];
  for (size_t i = 0; i < dataSize; ++i) {
    in[i] = i;
    tmp[i] = -1;
    out[i] = -1;
  }
  // Scope ensures write-back of the buffers before the host-side checks.
  {
    buffer<int> bIn{in, range{dataSize}};
    buffer<int> bTmp{tmp, range{dataSize}};
    buffer<int> bOut{out, range{dataSize}};

    ext::codeplay::experimental::fusion_wrapper fw{q};
    fw.start_fusion();

    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

    // Kernel 1: tmp[i] = in[i] + 5, with the requested promotion property.
    q.submit([&](handler &cgh) {
      auto accIn = bIn.get_access(cgh);
      property_list properties{};
      if (intKernel1 == Internalization::Private) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_private{}};
      } else if (intKernel1 == Internalization::Local) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_local{}};
      }
      accessor<int> accTmp = bTmp.get_access(cgh, properties);

      if (localSizeKernel1 > 0) {
        cgh.parallel_for<class Kernel1>(
            nd_range<1>{{dataSize}, {localSizeKernel1}},
            [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
      } else {
        // No work-group size specified.
        cgh.parallel_for<class KernelOne>(
            dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
      }
    });

    // Kernel 2: out[i] = tmp[i] * 2, with the requested promotion property.
    q.submit([&](handler &cgh) {
      property_list properties{};
      if (intKernel2 == Internalization::Private) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_private{}};
      } else if (intKernel2 == Internalization::Local) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_local{}};
      }
      accessor<int> accTmp = bTmp.get_access(cgh, properties);
      auto accOut = bOut.get_access(cgh);
      if (localSizeKernel2 > 0) {
        cgh.parallel_for<class Kernel2>(
            nd_range<1>{{dataSize}, {localSizeKernel2}},
            [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
      } else {
        // No work-group size specified.
        cgh.parallel_for<class KernelTwo>(
            dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
      }
    });

    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

    assert(!fw.is_in_fusion_mode() &&
           "Queue should not be in fusion mode anymore");
  }

  // Check the results. tmp[i] still being -1 means the intermediate buffer
  // was internalized and never written back.
  size_t numErrors = 0;
  size_t numInternalized = 0;
  for (size_t i = 0; i < dataSize; ++i) {
    if (out[i] != ((i + 5) * 2)) {
      ++numErrors;
    }
    if (tmp[i] == -1) {
      ++numInternalized;
    }
  }
  if (numErrors) {
    std::cout << "COMPUTATION ERROR\n";
    return;
  }
  if (!expectInternalization && numInternalized) {
    std::cout << "WRONG INTERNALIZATION\n";
    return;
  }
  std::cout << "COMPUTATION OK\n";
}
int main() {
  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: One accessor without internalization, one with local
  // internalization. Should fall back to no internalization and print a
  // warning.
  std::cout << "None, Local(0)\n";
  performFusion(q, Internalization::None, 0, Internalization::Local, 0);
  // CHECK: None, Local(0)
  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: One accessor without internalization, one with private
  // internalization. Should fall back to no internalization and print a
  // warning.
  std::cout << "None, Private\n";
  performFusion(q, Internalization::None, 0, Internalization::Private, 0);
  // CHECK-NEXT: None, Private
  // CHECK-NEXT: WARNING: Not performing specified private promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the second kernel does
  // not specify a work-group size. No promotion should happen and a warning
  // should be printed.
  std::cout << "Local(8), Local(0)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Local, 0);
  // CHECK-NEXT: Local(8), Local(0)
  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the first kernel does
  // not specify a work-group size. No promotion should happen and a warning
  // should be printed.
  std::cout << "Local(0), Local(8)\n";
  performFusion(q, Internalization::Local, 0, Internalization::Local, 8);
  // CHECK-NEXT: Local(0), Local(8)
  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the kernels specify
  // different work-group sizes. No promotion should happen and a warning should
  // be printed.
  std::cout << "Local(8), Local(16)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
  // CHECK-NEXT: Local(8), Local(16)
  // CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: One accessor with local internalization, one with private
  // internalization. Should fall back to local internalization and print a
  // warning.
  std::cout << "Local(8), Private(8)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Private, 8,
                /* expectInternalization */ true);
  // CHECK-NEXT: Local(8), Private(8)
  // CHECK-NEXT: WARNING: Performing local internalization instead, because previous accessor specified local promotion
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test complete fusion with local internalization and a combination of kernels
// that require a work-group barrier to be inserted by fusion.

#include <sycl/sycl.hpp>

using namespace sycl;
int main() {
15+
constexpr size_t dataSize = 512;
16+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
17+
18+
for (size_t i = 0; i < dataSize; ++i) {
19+
in1[i] = i * 2;
20+
in2[i] = i * 3;
21+
in3[i] = i * 4;
22+
tmp[i] = -1;
23+
out[i] = -1;
24+
}
25+
26+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
27+
28+
{
29+
buffer<int> bIn1{in1, range{dataSize}};
30+
buffer<int> bIn2{in2, range{dataSize}};
31+
buffer<int> bIn3{in3, range{dataSize}};
32+
buffer<int> bTmp{
33+
tmp,
34+
range{dataSize},
35+
{sycl::ext::codeplay::experimental::property::promote_local{}}};
36+
buffer<int> bOut{out, range{dataSize}};
37+
38+
ext::codeplay::experimental::fusion_wrapper fw{q};
39+
fw.start_fusion();
40+
41+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
42+
43+
q.submit([&](handler &cgh) {
44+
auto accIn1 = bIn1.get_access(cgh);
45+
auto accIn2 = bIn2.get_access(cgh);
46+
auto accTmp = bTmp.get_access(cgh);
47+
cgh.parallel_for<class KernelOne>(
48+
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
49+
auto workgroupSize = i.get_local_range(0);
50+
auto baseOffset = i.get_group_linear_id() * workgroupSize;
51+
auto localIndex = i.get_local_linear_id();
52+
auto localOffset = (workgroupSize - 1) - localIndex;
53+
accTmp[baseOffset + localOffset] =
54+
accIn1[baseOffset + localOffset] +
55+
accIn2[baseOffset + localOffset];
56+
});
57+
});
58+
59+
q.submit([&](handler &cgh) {
60+
auto accTmp = bTmp.get_access(cgh);
61+
auto accIn3 = bIn3.get_access(cgh);
62+
auto accOut = bOut.get_access(cgh);
63+
cgh.parallel_for<class KernelTwo>(
64+
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
65+
auto index = i.get_global_linear_id();
66+
accOut[index] = accTmp[index] * accIn3[index];
67+
});
68+
});
69+
70+
fw.complete_fusion();
71+
72+
assert(!fw.is_in_fusion_mode() &&
73+
"Queue should not be in fusion mode anymore");
74+
}
75+
76+
// Check the results
77+
for (size_t i = 0; i < dataSize; ++i) {
78+
assert(out[i] == (20 * i * i) && "Computation error");
79+
assert(tmp[i] == -1 && "Not internalized");
80+
}
81+
82+
return 0;
83+
}

0 commit comments

Comments
 (0)