@@ -487,13 +487,20 @@ sycl::event _populate_packed_shape_strides_for_copycast_kernel(
487
487
const std::vector<py::ssize_t > &src_strides,
488
488
const std::vector<py::ssize_t > &dst_strides)
489
489
{
490
- using shT = std::vector<py::ssize_t >;
490
+ // memory transfer optimization: using a USM-host temporary speeds up
491
+ // transfer to device, especially on dGPUs
492
+ using usm_host_allocatorT =
493
+ sycl::usm_allocator<py::ssize_t , sycl::usm::alloc::host>;
494
+ using shT = std::vector<py::ssize_t , usm_host_allocatorT>;
491
495
size_t nd = common_shape.size ();
492
496
497
+ usm_host_allocatorT allocator (exec_q);
498
+
493
499
// create host temporary for packed shape and strides managed by shared
494
500
// pointer. Packed vector is concatenation of common_shape, src_strides and
495
501
// dst_strides
496
- std::shared_ptr<shT> shp_host_shape_strides = std::make_shared<shT>(3 * nd);
502
+ std::shared_ptr<shT> shp_host_shape_strides =
503
+ std::make_shared<shT>(3 * nd, allocator);
497
504
std::copy (common_shape.begin (), common_shape.end (),
498
505
shp_host_shape_strides->begin ());
499
506
@@ -943,9 +950,12 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
943
950
throw std::runtime_error (" Unabled to allocate device memory" );
944
951
}
945
952
946
- using shT = std::vector<py::ssize_t >;
953
+ using usm_host_allocatorT =
954
+ sycl::usm_allocator<py::ssize_t , sycl::usm::alloc::host>;
955
+ using shT = std::vector<py::ssize_t , usm_host_allocatorT>;
956
+ usm_host_allocatorT allocator (exec_q);
947
957
std::shared_ptr<shT> packed_host_shapes_strides_shp =
948
- std::make_shared<shT>(2 * (src_nd + dst_nd));
958
+ std::make_shared<shT>(2 * (src_nd + dst_nd), allocator );
949
959
950
960
std::copy (src_shape, src_shape + src_nd,
951
961
packed_host_shapes_strides_shp->begin ());
@@ -956,13 +966,13 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
956
966
if (src_strides == nullptr ) {
957
967
int src_flags = src.get_flags ();
958
968
if (src_flags & USM_ARRAY_C_CONTIGUOUS) {
959
- const shT &src_contig_strides =
969
+ const auto &src_contig_strides =
960
970
c_contiguous_strides (src_nd, src_shape);
961
971
std::copy (src_contig_strides.begin (), src_contig_strides.end (),
962
972
packed_host_shapes_strides_shp->begin () + src_nd);
963
973
}
964
974
else if (src_flags & USM_ARRAY_F_CONTIGUOUS) {
965
- const shT &src_contig_strides =
975
+ const auto &src_contig_strides =
966
976
c_contiguous_strides (src_nd, src_shape);
967
977
std::copy (src_contig_strides.begin (), src_contig_strides.end (),
968
978
packed_host_shapes_strides_shp->begin () + src_nd);
@@ -982,14 +992,14 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
982
992
if (dst_strides == nullptr ) {
983
993
int dst_flags = dst.get_flags ();
984
994
if (dst_flags & USM_ARRAY_C_CONTIGUOUS) {
985
- const shT &dst_contig_strides =
995
+ const auto &dst_contig_strides =
986
996
c_contiguous_strides (dst_nd, dst_shape);
987
997
std::copy (dst_contig_strides.begin (), dst_contig_strides.end (),
988
998
packed_host_shapes_strides_shp->begin () + 2 * src_nd +
989
999
dst_nd);
990
1000
}
991
1001
else if (dst_flags & USM_ARRAY_F_CONTIGUOUS) {
992
- const shT &dst_contig_strides =
1002
+ const auto &dst_contig_strides =
993
1003
f_contiguous_strides (dst_nd, dst_shape);
994
1004
std::copy (dst_contig_strides.begin (), dst_contig_strides.end (),
995
1005
packed_host_shapes_strides_shp->begin () + 2 * src_nd +
@@ -1349,7 +1359,12 @@ void copy_numpy_ndarray_into_usm_ndarray(
1349
1359
throw std::runtime_error (" Unabled to allocate device memory" );
1350
1360
}
1351
1361
1352
- std::shared_ptr<shT> host_shape_strides_shp = std::make_shared<shT>(3 * nd);
1362
+ using usm_host_allocatorT =
1363
+ sycl::usm_allocator<py::ssize_t , sycl::usm::alloc::host>;
1364
+ using usmshT = std::vector<py::ssize_t , usm_host_allocatorT>;
1365
+ usm_host_allocatorT alloc (exec_q);
1366
+
1367
+ auto host_shape_strides_shp = std::make_shared<usmshT>(3 * nd, alloc);
1353
1368
std::copy (simplified_shape.begin (), simplified_shape.end (),
1354
1369
host_shape_strides_shp->begin ());
1355
1370
std::copy (simplified_src_strides.begin (), simplified_src_strides.end (),
@@ -2023,9 +2038,10 @@ tri(sycl::queue &exec_q,
2023
2038
return std::make_pair (sycl::event (), sycl::event ());
2024
2039
}
2025
2040
2026
- // check that arrays do not overlap, and concurrent copying is safe.
2027
2041
char *src_data = src.get_data ();
2028
2042
char *dst_data = dst.get_data ();
2043
+
2044
+ // check that arrays do not overlap, and concurrent copying is safe.
2029
2045
auto src_offsets = src.get_minmax_offsets ();
2030
2046
auto dst_offsets = dst.get_minmax_offsets ();
2031
2047
int src_elem_size = src.get_elemsize ();
@@ -2045,6 +2061,7 @@ tri(sycl::queue &exec_q,
2045
2061
int dst_typenum = dst.get_typenum ();
2046
2062
int src_typeid = array_types.typenum_to_lookup_id (src_typenum);
2047
2063
int dst_typeid = array_types.typenum_to_lookup_id (dst_typenum);
2064
+
2048
2065
if (dst_typeid != src_typeid) {
2049
2066
throw py::value_error (" Array dtype are not the same." );
2050
2067
}
@@ -2059,11 +2076,13 @@ tri(sycl::queue &exec_q,
2059
2076
}
2060
2077
2061
2078
using shT = std::vector<py::ssize_t >;
2062
- int src_flags = src.get_flags ();
2063
- const py::ssize_t *src_strides_raw = src.get_strides_raw ();
2064
2079
shT src_strides (src_nd);
2080
+
2081
+ int src_flags = src.get_flags ();
2065
2082
bool is_src_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) != 0 );
2066
2083
bool is_src_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) != 0 );
2084
+
2085
+ const py::ssize_t *src_strides_raw = src.get_strides_raw ();
2067
2086
if (src_strides_raw == nullptr ) {
2068
2087
if (is_src_c_contig) {
2069
2088
src_strides = c_contiguous_strides (src_nd, src_shape);
@@ -2081,11 +2100,13 @@ tri(sycl::queue &exec_q,
2081
2100
src_strides.begin ());
2082
2101
}
2083
2102
2084
- int dst_flags = dst.get_flags ();
2085
- const py::ssize_t *dst_strides_raw = dst.get_strides_raw ();
2086
2103
shT dst_strides (src_nd);
2104
+
2105
+ int dst_flags = dst.get_flags ();
2087
2106
bool is_dst_c_contig = ((dst_flags & USM_ARRAY_C_CONTIGUOUS) != 0 );
2088
2107
bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0 );
2108
+
2109
+ const py::ssize_t *dst_strides_raw = dst.get_strides_raw ();
2089
2110
if (dst_strides_raw == nullptr ) {
2090
2111
if (is_dst_c_contig) {
2091
2112
dst_strides = c_contiguous_strides (src_nd, src_shape);
@@ -2128,23 +2149,29 @@ tri(sycl::queue &exec_q,
2128
2149
}
2129
2150
2130
2151
nd += 2 ;
2131
- std::vector<py::ssize_t > shape_and_strides (3 * nd);
2152
+
2153
+ using usm_host_allocatorT =
2154
+ sycl::usm_allocator<py::ssize_t , sycl::usm::alloc::host>;
2155
+ using usmshT = std::vector<py::ssize_t , usm_host_allocatorT>;
2156
+
2157
+ usm_host_allocatorT allocator (exec_q);
2158
+ auto shp_host_shape_and_strides =
2159
+ std::make_shared<usmshT>(3 * nd, allocator);
2132
2160
2133
2161
std::copy (simplified_shape.begin (), simplified_shape.end (),
2134
- shape_and_strides.begin ());
2135
- shape_and_strides[nd - 2 ] = src_shape[src_nd - 2 ];
2136
- shape_and_strides[nd - 1 ] = src_shape[src_nd - 1 ];
2162
+ shp_host_shape_and_strides->begin ());
2163
+ (*shp_host_shape_and_strides)[nd - 2 ] = src_shape[src_nd - 2 ];
2164
+ (*shp_host_shape_and_strides)[nd - 1 ] = src_shape[src_nd - 1 ];
2165
+
2137
2166
std::copy (simplified_src_strides.begin (), simplified_src_strides.end (),
2138
- shape_and_strides.begin () + nd);
2139
- shape_and_strides[2 * nd - 2 ] = src_strides[src_nd - 2 ];
2140
- shape_and_strides[2 * nd - 1 ] = src_strides[src_nd - 1 ];
2141
- std::copy (simplified_dst_strides.begin (), simplified_dst_strides.end (),
2142
- shape_and_strides.begin () + 2 * nd);
2143
- shape_and_strides[3 * nd - 2 ] = dst_strides[src_nd - 2 ];
2144
- shape_and_strides[3 * nd - 1 ] = dst_strides[src_nd - 1 ];
2167
+ shp_host_shape_and_strides->begin () + nd);
2168
+ (*shp_host_shape_and_strides)[2 * nd - 2 ] = src_strides[src_nd - 2 ];
2169
+ (*shp_host_shape_and_strides)[2 * nd - 1 ] = src_strides[src_nd - 1 ];
2145
2170
2146
- std::shared_ptr<shT> shp_host_shape_and_strides =
2147
- std::make_shared<shT>(shape_and_strides);
2171
+ std::copy (simplified_dst_strides.begin (), simplified_dst_strides.end (),
2172
+ shp_host_shape_and_strides->begin () + 2 * nd);
2173
+ (*shp_host_shape_and_strides)[3 * nd - 2 ] = dst_strides[src_nd - 2 ];
2174
+ (*shp_host_shape_and_strides)[3 * nd - 1 ] = dst_strides[src_nd - 1 ];
2148
2175
2149
2176
py::ssize_t *dev_shape_and_strides =
2150
2177
sycl::malloc_device<ssize_t >(3 * nd, exec_q);
@@ -2154,8 +2181,7 @@ tri(sycl::queue &exec_q,
2154
2181
sycl::event copy_shape_and_strides = exec_q.copy <ssize_t >(
2155
2182
shp_host_shape_and_strides->data (), dev_shape_and_strides, 3 * nd);
2156
2183
2157
- py::ssize_t inner_range =
2158
- shape_and_strides[nd - 1 ] * shape_and_strides[nd - 2 ];
2184
+ py::ssize_t inner_range = src_shape[src_nd - 1 ] * src_shape[src_nd - 2 ];
2159
2185
py::ssize_t outer_range = src_nelems / inner_range;
2160
2186
2161
2187
sycl::event tri_ev;
@@ -2182,6 +2208,7 @@ tri(sycl::queue &exec_q,
2182
2208
sycl::free (dev_shape_and_strides, ctx);
2183
2209
});
2184
2210
});
2211
+
2185
2212
return std::make_pair (keep_args_alive (exec_q, {src, dst}, {tri_ev}),
2186
2213
tri_ev);
2187
2214
}
0 commit comments