Skip to content

Commit 9dce2d2

Browse files
authored
[SYCL][ESIMD] Add support for local accessors to lsc API (#10340)
1 parent 03c13fd commit 9dce2d2

File tree

3 files changed

+363
-29
lines changed

3 files changed

+363
-29
lines changed

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 130 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -718,8 +718,10 @@ template <typename T, int NElts = 1,
718718
lsc_data_size DS = lsc_data_size::default_size,
719719
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
720720
int N, typename AccessorTy>
721-
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy>,
722-
__ESIMD_NS::simd<T, N * NElts>>
721+
__ESIMD_API std::enable_if_t<
722+
!std::is_pointer_v<AccessorTy> &&
723+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
724+
__ESIMD_NS::simd<T, N * NElts>>
723725
lsc_gather(AccessorTy acc,
724726
#ifdef __ESIMD_FORCE_STATELESS_MEM
725727
__ESIMD_NS::simd<uint64_t, N> offsets,
@@ -755,17 +757,31 @@ template <typename T, int NElts = 1,
755757
lsc_data_size DS = lsc_data_size::default_size,
756758
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
757759
int N, typename AccessorTy, typename Toffset>
758-
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy> &&
759-
std::is_integral_v<Toffset> &&
760-
!std::is_same_v<Toffset, uint64_t>,
761-
__ESIMD_NS::simd<T, N * NElts>>
760+
__ESIMD_API std::enable_if_t<
761+
!std::is_pointer_v<AccessorTy> &&
762+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
763+
std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>,
764+
__ESIMD_NS::simd<T, N * NElts>>
762765
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
763766
__ESIMD_NS::simd_mask<N> pred = 1) {
764767
return lsc_gather<T, NElts, DS, L1H, L3H, N, AccessorTy>(
765768
acc, convert<uint64_t>(offsets), pred);
766769
}
767770
#endif
768771

772+
template <typename T, int NElts = 1,
773+
lsc_data_size DS = lsc_data_size::default_size,
774+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
775+
int N, typename AccessorTy>
776+
__ESIMD_API std::enable_if_t<
777+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
778+
__ESIMD_NS::simd<T, N * NElts>>
779+
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
780+
__ESIMD_NS::simd_mask<N> pred = 1) {
781+
return lsc_slm_gather<T, NElts, DS>(
782+
offsets + __ESIMD_DNS::localAccessorToOffset(acc), pred);
783+
}
784+
769785
/// Accessor-based gather.
770786
/// Supported platforms: DG2, PVC
771787
/// VISA instruction: lsc_load.ugm
@@ -791,8 +807,10 @@ template <typename T, int NElts = 1,
791807
lsc_data_size DS = lsc_data_size::default_size,
792808
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
793809
int N, typename AccessorTy>
794-
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy>,
795-
__ESIMD_NS::simd<T, N * NElts>>
810+
__ESIMD_API std::enable_if_t<
811+
!std::is_pointer_v<AccessorTy> &&
812+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
813+
__ESIMD_NS::simd<T, N * NElts>>
796814
lsc_gather(AccessorTy acc,
797815
#ifdef __ESIMD_FORCE_STATELESS_MEM
798816
__ESIMD_NS::simd<uint64_t, N> offsets,
@@ -833,10 +851,11 @@ template <typename T, int NElts = 1,
833851
lsc_data_size DS = lsc_data_size::default_size,
834852
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
835853
int N, typename AccessorTy, typename Toffset>
836-
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy> &&
837-
std::is_integral_v<Toffset> &&
838-
!std::is_same_v<Toffset, uint64_t>,
839-
__ESIMD_NS::simd<T, N * NElts>>
854+
__ESIMD_API std::enable_if_t<
855+
!std::is_pointer_v<AccessorTy> &&
856+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
857+
std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>,
858+
__ESIMD_NS::simd<T, N * NElts>>
840859
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
841860
__ESIMD_NS::simd_mask<N> pred,
842861
__ESIMD_NS::simd<T, N * NElts> old_values) {
@@ -845,6 +864,20 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
845864
}
846865
#endif
847866

867+
template <typename T, int NElts = 1,
868+
lsc_data_size DS = lsc_data_size::default_size,
869+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
870+
int N, typename AccessorTy>
871+
__ESIMD_API std::enable_if_t<
872+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
873+
__ESIMD_NS::simd<T, N * NElts>>
874+
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
875+
__ESIMD_NS::simd_mask<N> pred,
876+
__ESIMD_NS::simd<T, N * NElts> old_values) {
877+
return lsc_slm_gather<T, NElts, DS>(
878+
offsets + __ESIMD_DNS::localAccessorToOffset(acc), pred, old_values);
879+
}
880+
848881
/// USM pointer transposed gather with 1 channel.
849882
/// Supported platforms: DG2, PVC
850883
/// VISA instruction: lsc_load.ugm
@@ -1136,9 +1169,11 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
11361169
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
11371170
typename AccessorTy,
11381171
typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1139-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1140-
__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1141-
__ESIMD_NS::simd<T, NElts>>
1172+
__ESIMD_API std::enable_if_t<
1173+
!std::is_pointer<AccessorTy>::value &&
1174+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
1175+
__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1176+
__ESIMD_NS::simd<T, NElts>>
11421177
lsc_block_load(AccessorTy acc,
11431178
#ifdef __ESIMD_FORCE_STATELESS_MEM
11441179
uint64_t offset,
@@ -1209,6 +1244,20 @@ lsc_block_load(AccessorTy acc,
12091244
#endif // !__ESIMD_FORCE_STATELESS_MEM
12101245
}
12111246

1247+
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
1248+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1249+
typename AccessorTy,
1250+
typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1251+
__ESIMD_API std::enable_if_t<
1252+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
1253+
__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1254+
__ESIMD_NS::simd<T, NElts>>
1255+
lsc_block_load(AccessorTy acc, uint32_t offset,
1256+
__ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
1257+
return lsc_slm_block_load<T, NElts, DS>(
1258+
offset + __ESIMD_DNS::localAccessorToOffset(acc), pred);
1259+
}
1260+
12121261
/// A variation of lsc_block_load without predicate parameter to simplify use
12131262
/// of alignment parameter
12141263
///
@@ -1300,9 +1349,11 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
13001349
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
13011350
typename AccessorTy,
13021351
typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1303-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1304-
__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1305-
__ESIMD_NS::simd<T, NElts>>
1352+
__ESIMD_API std::enable_if_t<
1353+
!std::is_pointer<AccessorTy>::value &&
1354+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
1355+
__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1356+
__ESIMD_NS::simd<T, NElts>>
13061357
lsc_block_load(AccessorTy acc,
13071358
#ifdef __ESIMD_FORCE_STATELESS_MEM
13081359
uint64_t offset,
@@ -1373,6 +1424,19 @@ lsc_block_load(AccessorTy acc,
13731424
#endif // !__ESIMD_FORCE_STATELESS_MEM
13741425
}
13751426

1427+
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
1428+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1429+
typename AccessorTy,
1430+
typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1431+
__ESIMD_API std::enable_if_t<
1432+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
1433+
__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1434+
__ESIMD_NS::simd<T, NElts>>
1435+
lsc_block_load(AccessorTy acc, uint32_t offset, __ESIMD_NS::simd_mask<1> pred,
1436+
__ESIMD_NS::simd<T, NElts> old_values, FlagsT flags = FlagsT{}) {
1437+
return lsc_slm_block_load<T, NElts, DS>(
1438+
offset + __ESIMD_DNS::localAccessorToOffset(acc), pred, old_values);
1439+
}
13761440
/// USM pointer prefetch gather.
13771441
/// Supported platforms: DG2, PVC
13781442
/// VISA instruction: lsc_load.ugm
@@ -1494,7 +1558,9 @@ template <typename T, int NElts = 1,
14941558
lsc_data_size DS = lsc_data_size::default_size,
14951559
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
14961560
int N, typename AccessorTy>
1497-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
1561+
__ESIMD_API std::enable_if_t<
1562+
!std::is_pointer<AccessorTy>::value &&
1563+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
14981564
lsc_prefetch(AccessorTy acc,
14991565
#ifdef __ESIMD_FORCE_STATELESS_MEM
15001566
__ESIMD_NS::simd<uint64_t, N> offsets,
@@ -1528,9 +1594,10 @@ template <typename T, int NElts = 1,
15281594
lsc_data_size DS = lsc_data_size::default_size,
15291595
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
15301596
int N, typename AccessorTy, typename Toffset>
1531-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1532-
std::is_integral_v<Toffset> &&
1533-
!std::is_same_v<Toffset, uint64_t>>
1597+
__ESIMD_API std::enable_if_t<
1598+
!std::is_pointer<AccessorTy>::value &&
1599+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
1600+
std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
15341601
lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
15351602
__ESIMD_NS::simd_mask<N> pred = 1) {
15361603
lsc_prefetch<T, NElts, DS, L1H, L3H, N, AccessorTy>(
@@ -1557,7 +1624,9 @@ template <typename T, int NElts = 1,
15571624
lsc_data_size DS = lsc_data_size::default_size,
15581625
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
15591626
typename AccessorTy>
1560-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
1627+
__ESIMD_API std::enable_if_t<
1628+
!std::is_pointer<AccessorTy>::value &&
1629+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
15611630
lsc_prefetch(AccessorTy acc,
15621631
#ifdef __ESIMD_FORCE_STATELESS_MEM
15631632
uint64_t offset
@@ -1748,7 +1817,9 @@ template <typename T, int NElts = 1,
17481817
lsc_data_size DS = lsc_data_size::default_size,
17491818
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
17501819
int N, typename AccessorTy>
1751-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
1820+
__ESIMD_API std::enable_if_t<
1821+
!std::is_pointer<AccessorTy>::value &&
1822+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
17521823
lsc_scatter(AccessorTy acc,
17531824
#ifdef __ESIMD_FORCE_STATELESS_MEM
17541825
__ESIMD_NS::simd<uint64_t, N> offsets,
@@ -1786,16 +1857,31 @@ template <typename T, int NElts = 1,
17861857
lsc_data_size DS = lsc_data_size::default_size,
17871858
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
17881859
int N, typename AccessorTy, typename Toffset>
1789-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1790-
std::is_integral_v<Toffset> &&
1791-
!std::is_same_v<Toffset, uint64_t>>
1860+
__ESIMD_API std::enable_if_t<
1861+
!std::is_pointer<AccessorTy>::value &&
1862+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
1863+
std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
17921864
lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
17931865
__ESIMD_NS::simd<T, N * NElts> vals,
17941866
__ESIMD_NS::simd_mask<N> pred = 1) {
17951867
lsc_scatter<T, NElts, DS, L1H, L3H, N, AccessorTy>(
17961868
acc, convert<uint64_t>(offsets), vals, pred);
17971869
}
17981870
#endif
1871+
1872+
template <typename T, int NElts = 1,
1873+
lsc_data_size DS = lsc_data_size::default_size,
1874+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1875+
int N, typename AccessorTy>
1876+
__ESIMD_API std::enable_if_t<
1877+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
1878+
lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
1879+
__ESIMD_NS::simd<T, N * NElts> vals,
1880+
__ESIMD_NS::simd_mask<N> pred = 1) {
1881+
lsc_slm_scatter<T, NElts, DS>(
1882+
offsets + __ESIMD_DNS::localAccessorToOffset(acc), vals, pred);
1883+
}
1884+
17991885
/// USM pointer transposed scatter with 1 channel.
18001886
/// Supported platforms: DG2, PVC
18011887
/// VISA instruction: lsc_store.ugm
@@ -1967,8 +2053,10 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
19672053
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
19682054
typename AccessorTy,
19692055
typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1970-
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1971-
__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
2056+
__ESIMD_API std::enable_if_t<
2057+
!std::is_pointer<AccessorTy>::value &&
2058+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
2059+
__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
19722060
lsc_block_store(AccessorTy acc,
19732061
#ifdef __ESIMD_FORCE_STATELESS_MEM
19742062
uint64_t offset,
@@ -2042,6 +2130,19 @@ lsc_block_store(AccessorTy acc,
20422130
#endif
20432131
}
20442132

2133+
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
2134+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2135+
typename AccessorTy,
2136+
typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
2137+
__ESIMD_API std::enable_if_t<
2138+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy> &&
2139+
__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
2140+
lsc_block_store(AccessorTy acc, uint32_t offset,
2141+
__ESIMD_NS::simd<T, NElts> vals, FlagsT flags = FlagsT{}) {
2142+
lsc_slm_block_store<T, NElts, DS>(
2143+
offset + __ESIMD_DNS::localAccessorToOffset(acc), vals);
2144+
}
2145+
20452146
/// A variation of lsc_block_store without predicate parameter to simplify
20462147
/// use of alignment parameter
20472148
///
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
//==- lsc_local_accessor_block_load_store.cpp - DPC++ ESIMD on-device test ==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc
9+
// RUN: %{build} -o %t.out
10+
// RUN: %{run} %t.out
11+
//
12+
// TODO: Enable the test when GPU driver is ready/fixed.
13+
// XFAIL: opencl || windows || gpu-intel-pvc
14+
// UNSUPPORTED: esimd_emulator
15+
// This test verifies usage of block_load/block_store for local_accessor.
16+
17+
#include "../esimd_test_utils.hpp"
18+
19+
#include <sycl/ext/intel/esimd.hpp>
20+
#include <sycl/sycl.hpp>
21+
22+
#include <iostream>
23+
24+
using namespace sycl;
25+
using namespace sycl::ext::intel::esimd;
26+
using namespace sycl::ext::intel::experimental::esimd;
27+
28+
constexpr int VL = 16;
29+
30+
template <typename T>
31+
bool test(queue Q, uint32_t LocalRange, uint32_t GlobalRange) {
32+
std::cout << "Running case: T=" << esimd_test::type_name<T>() << std::endl;
33+
34+
// The test is going to use (LocalRange * VL) elements of T type.
35+
auto Dev = Q.get_device();
36+
auto DeviceSLMSize = Dev.get_info<sycl::info::device::local_mem_size>();
37+
if (DeviceSLMSize < LocalRange * VL * sizeof(T)) {
38+
// Report an error - the test needs a fix.
39+
std::cerr << "Error: Test needs more SLM memory than device has!"
40+
<< std::endl;
41+
return false;
42+
}
43+
44+
T *Out = malloc_shared<T>(GlobalRange * VL, Q);
45+
for (int I = 0; I < GlobalRange * VL; I++)
46+
Out[I] = -1;
47+
48+
try {
49+
nd_range<1> NDRange{range<1>{GlobalRange}, range<1>{LocalRange}};
50+
Q.submit([&](handler &CGH) {
51+
auto LocalAcc = local_accessor<T, 1>(LocalRange * VL, CGH);
52+
53+
CGH.parallel_for(NDRange, [=](nd_item<1> Item) SYCL_ESIMD_KERNEL {
54+
uint32_t GID = Item.get_global_id(0);
55+
uint32_t LID = Item.get_local_id(0);
56+
57+
simd<int, VL> IntValues(GID * 100, 1);
58+
simd<T, VL> ValuesToSLM = IntValues;
59+
lsc_block_store(LocalAcc, LID * VL * sizeof(T), ValuesToSLM);
60+
61+
Item.barrier();
62+
63+
if (LID == 0) {
64+
for (int LID = 0; LID < LocalRange; LID++) {
65+
simd<T, VL> ValuesFromSLM =
66+
lsc_block_load<T, VL>(LocalAcc, LID * VL * sizeof(T));
67+
ValuesFromSLM.copy_to(Out + (GID + LID) * VL);
68+
} // end for (int LID = 0; LID < LocalRange; LID++)
69+
} // end if (LID == 0)
70+
});
71+
}).wait();
72+
} catch (sycl::exception const &e) {
73+
std::cout << "SYCL exception caught: " << e.what() << '\n';
74+
free(Out, Q);
75+
return false;
76+
}
77+
78+
bool Pass = true;
79+
for (int I = 0; I < GlobalRange * VL; I++) {
80+
int GID = I / VL;
81+
int LID = GID % LocalRange;
82+
int VecElementIndex = I % VL;
83+
84+
T Expected = GID * 100 + VecElementIndex;
85+
T Computed = Out[I];
86+
if (Computed != Expected) {
87+
std::cout << "Error: Out[" << I << "]:" << Computed << " != " << Expected
88+
<< ":[expected]" << std::endl;
89+
Pass = false;
90+
}
91+
}
92+
93+
free(Out, Q);
94+
return Pass;
95+
}
96+
97+
int main() {
98+
auto Q = queue{gpu_selector_v};
99+
auto Dev = Q.get_device();
100+
auto DeviceSLMSize = Dev.get_info<sycl::info::device::local_mem_size>();
101+
std::cout << "Running on " << Dev.get_info<sycl::info::device::name>()
102+
<< ", Local memory size available : " << DeviceSLMSize << std::endl;
103+
104+
constexpr uint32_t LocalRange = 16;
105+
constexpr uint32_t GlobalRange = LocalRange * 2; // 2 groups.
106+
107+
bool Pass = true;
108+
Pass &= test<int>(Q, LocalRange, GlobalRange);
109+
Pass &= test<float>(Q, LocalRange, GlobalRange);
110+
111+
std::cout << "Test result: " << (Pass ? "Pass" : "Fail") << std::endl;
112+
return Pass ? 0 : 1;
113+
}

0 commit comments

Comments
 (0)