@@ -36,7 +36,7 @@ mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args,
36
36
int root_low_rank ,
37
37
struct ompi_communicator_t * up_comm ,
38
38
struct ompi_communicator_t * low_comm ,
39
- int w_rank , bool noop , ompi_request_t * req )
39
+ int w_rank , bool noop , bool is_mapbycore , ompi_request_t * req )
40
40
{
41
41
args -> cur_task = cur_task ;
42
42
args -> sbuf = sbuf ;
@@ -53,6 +53,7 @@ mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args,
53
53
args -> low_comm = low_comm ;
54
54
args -> w_rank = w_rank ;
55
55
args -> noop = noop ;
56
+ args -> is_mapbycore = is_mapbycore ;
56
57
args -> req = req ;
57
58
}
58
59
@@ -70,7 +71,6 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
70
71
int root_low_rank , root_up_rank ; /* root ranks for both sub-communicators */
71
72
char * reorder_buf = NULL , * reorder_rbuf = NULL ;
72
73
int i , err , * vranks , low_rank , low_size , * topo ;
73
- ptrdiff_t rsize , rgap = 0 , rextent ;
74
74
ompi_request_t * temp_request = NULL ;
75
75
76
76
/* Create the subcommunicators */
@@ -100,6 +100,7 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
100
100
comm , comm -> c_coll -> coll_gather_module );
101
101
}
102
102
103
+ ompi_datatype_t * dtype = (w_rank == root ) ? rdtype : sdtype ;
103
104
w_rank = ompi_comm_rank (comm );
104
105
w_size = ompi_comm_size (comm );
105
106
/* Set up request */
@@ -128,7 +129,6 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
128
129
"[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n" ,
129
130
w_rank , root , root_low_rank , root_up_rank ));
130
131
131
- ompi_datatype_type_extent (rdtype , & rextent );
132
132
133
133
/* Allocate reorder buffers */
134
134
if (w_rank == root ) {
@@ -142,12 +142,25 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
142
142
143
143
} else {
144
144
/* Need a buffer to store unordered final result */
145
+ ptrdiff_t rsize , rgap ;
145
146
rsize = opal_datatype_span (& rdtype -> super ,
146
147
(int64_t )rcount * w_size ,
147
148
& rgap );
148
149
reorder_buf = (char * )malloc (rsize ); //TODO:free
149
150
/* rgap is the size of unused space at the start of the datatype */
150
151
reorder_rbuf = reorder_buf - rgap ;
152
+
153
+ if (MPI_IN_PLACE == sbuf ) {
154
+ ptrdiff_t rextent ;
155
+ ompi_datatype_type_extent (rdtype , & rextent );
156
+ ptrdiff_t block_size = rextent * (ptrdiff_t )rcount ;
157
+ ptrdiff_t src_shift = block_size * w_rank ;
158
+ ptrdiff_t dest_shift = block_size * w_rank ;
159
+ ompi_datatype_copy_content_same_ddt (dtype ,
160
+ (ptrdiff_t )rcount ,
161
+ (char * )rbuf + dest_shift ,
162
+ reorder_rbuf + src_shift );
163
+ }
151
164
}
152
165
}
153
166
@@ -158,7 +171,7 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
158
171
mca_coll_han_gather_args_t * lg_args = malloc (sizeof (mca_coll_han_gather_args_t ));
159
172
mca_coll_han_set_gather_args (lg_args , lg , (char * ) sbuf , NULL , scount , sdtype , reorder_rbuf ,
160
173
rcount , rdtype , root , root_up_rank , root_low_rank , up_comm ,
161
- low_comm , w_rank , low_rank != root_low_rank , temp_request );
174
+ low_comm , w_rank , low_rank != root_low_rank , han_module -> is_mapbycore , temp_request );
162
175
/* Init lg task */
163
176
init_task (lg , mca_coll_han_gather_lg_task , (void * ) (lg_args ));
164
177
/* Issue lg task */
@@ -176,6 +189,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
176
189
*/
177
190
/* reorder rbuf based on rank */
178
191
if (w_rank == root && !han_module -> is_mapbycore ) {
192
+ ptrdiff_t rextent ;
193
+ ompi_datatype_type_extent (rdtype , & rextent );
179
194
for (i = 0 ; i < w_size ; i ++ ) {
180
195
OPAL_OUTPUT_VERBOSE ((30 , mca_coll_han_component .han_output ,
181
196
"[%d]: Han Gather copy from %d to %d\n" ,
@@ -202,6 +217,15 @@ int mca_coll_han_gather_lg_task(void *task_args)
202
217
mca_coll_han_gather_args_t * t = (mca_coll_han_gather_args_t * ) task_args ;
203
218
OPAL_OUTPUT_VERBOSE ((30 , mca_coll_han_component .han_output , "[%d] Han Gather: lg\n" ,
204
219
t -> w_rank ));
220
+ ompi_datatype_t * dtype ;
221
+ size_t count ;
222
+ if (t -> w_rank == t -> root ) {
223
+ dtype = t -> rdtype ;
224
+ count = t -> rcount ;
225
+ } else {
226
+ dtype = t -> sdtype ;
227
+ count = t -> scount ;
228
+ }
205
229
206
230
/* If the process is one of the node leaders */
207
231
char * tmp_buf = NULL ;
@@ -210,21 +234,35 @@ int mca_coll_han_gather_lg_task(void *task_args)
210
234
/* if the process is one of the node leaders, allocate the intermediary
211
235
* buffer to gather on the low sub communicator */
212
236
int low_size = ompi_comm_size (t -> low_comm );
237
+ int low_rank = ompi_comm_rank (t -> low_comm );
213
238
ptrdiff_t rsize , rgap = 0 ;
214
- rsize = opal_datatype_span (& t -> rdtype -> super ,
215
- ( int64_t ) t -> rcount * low_size ,
239
+ rsize = opal_datatype_span (& dtype -> super ,
240
+ count * low_size ,
216
241
& rgap );
217
242
tmp_buf = (char * ) malloc (rsize );
218
243
tmp_rbuf = tmp_buf - rgap ;
244
+ if (t -> w_rank == t -> root ) {
245
+ if (t -> is_mapbycore && MPI_IN_PLACE == t -> sbuf ) {
246
+ ptrdiff_t rextent ;
247
+ ompi_datatype_type_extent (dtype , & rextent );
248
+ ptrdiff_t block_size = rextent * (ptrdiff_t )count ;
249
+ ptrdiff_t src_shift = block_size * t -> w_rank ;
250
+ ptrdiff_t dest_shift = block_size * low_rank ;
251
+ ompi_datatype_copy_content_same_ddt (dtype ,
252
+ (ptrdiff_t )count ,
253
+ tmp_rbuf + dest_shift ,
254
+ (char * )t -> rbuf + src_shift );
255
+ }
256
+ }
219
257
}
220
258
221
259
/* Low level (usually intra-node or shared memory) node gather */
222
260
t -> low_comm -> c_coll -> coll_gather ((char * )t -> sbuf ,
223
- t -> scount ,
224
- t -> sdtype ,
261
+ count ,
262
+ dtype ,
225
263
tmp_rbuf ,
226
- t -> rcount ,
227
- t -> rdtype ,
264
+ count ,
265
+ dtype ,
228
266
t -> root_low_rank ,
229
267
t -> low_comm ,
230
268
t -> low_comm -> c_coll -> coll_gather_module );
@@ -253,14 +291,25 @@ int mca_coll_han_gather_ug_task(void *task_args)
253
291
OPAL_OUTPUT_VERBOSE ((30 , mca_coll_han_component .han_output ,
254
292
"[%d] Han Gather: ug noop\n" , t -> w_rank ));
255
293
} else {
294
+ ompi_datatype_t * dtype ;
295
+ size_t count ;
296
+ if (t -> w_rank == t -> root ) {
297
+ dtype = t -> rdtype ;
298
+ count = t -> rcount ;
299
+ } else {
300
+ dtype = t -> sdtype ;
301
+ count = t -> scount ;
302
+ }
303
+
304
+
256
305
int low_size = ompi_comm_size (t -> low_comm );
257
306
/* inter node gather */
258
307
t -> up_comm -> c_coll -> coll_gather ((char * )t -> sbuf ,
259
- t -> scount * low_size ,
260
- t -> sdtype ,
308
+ count * low_size ,
309
+ dtype ,
261
310
(char * )t -> rbuf ,
262
- t -> rcount * low_size ,
263
- t -> rdtype ,
311
+ count * low_size ,
312
+ dtype ,
264
313
t -> root_up_rank ,
265
314
t -> up_comm ,
266
315
t -> up_comm -> c_coll -> coll_gather_module );
@@ -320,6 +369,17 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
320
369
321
370
ompi_communicator_t * low_comm = han_module -> sub_comm [INTRA_NODE ];
322
371
ompi_communicator_t * up_comm = han_module -> sub_comm [INTER_NODE ];
372
+ ompi_datatype_t * dtype ;
373
+ size_t count ;
374
+
375
+ if (w_rank == root ) {
376
+ dtype = rdtype ;
377
+ count = rcount ;
378
+ } else {
379
+ dtype = sdtype ;
380
+ count = scount ;
381
+ }
382
+
323
383
324
384
/* Get the 'virtual ranks' mapping corresponding to the communicators */
325
385
int * vranks = han_module -> cached_vranks ;
@@ -359,32 +419,32 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
359
419
char * tmp_buf_start = NULL ; // start of the data
360
420
if (low_rank == root_low_rank ) {
361
421
ptrdiff_t rsize , rgap = 0 ;
362
- rsize = opal_datatype_span (& rdtype -> super ,
363
- ( int64_t ) rcount * low_size ,
422
+ rsize = opal_datatype_span (& dtype -> super ,
423
+ count * low_size ,
364
424
& rgap );
365
425
tmp_buf = (char * ) malloc (rsize );
366
426
tmp_buf_start = tmp_buf - rgap ;
367
427
}
368
428
369
429
/* 1. low gather on node leaders */
370
430
low_comm -> c_coll -> coll_gather ((char * )sbuf ,
371
- scount ,
372
- sdtype ,
431
+ count ,
432
+ dtype ,
373
433
tmp_buf_start ,
374
- rcount ,
375
- rdtype ,
434
+ count ,
435
+ dtype ,
376
436
root_low_rank ,
377
437
low_comm ,
378
438
low_comm -> c_coll -> coll_gather_module );
379
439
380
440
/* 2. upper gather (inter-node) between node leaders */
381
441
if (low_rank == root_low_rank ) {
382
442
up_comm -> c_coll -> coll_gather ((char * )tmp_buf_start ,
383
- scount * low_size ,
384
- sdtype ,
443
+ count * low_size ,
444
+ dtype ,
385
445
(char * )reorder_buf_start ,
386
- rcount * low_size ,
387
- rdtype ,
446
+ count * low_size ,
447
+ dtype ,
388
448
root_up_rank ,
389
449
up_comm ,
390
450
up_comm -> c_coll -> coll_gather_module );
@@ -425,15 +485,15 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
425
485
void
426
486
ompi_coll_han_reorder_gather (const void * sbuf ,
427
487
void * rbuf , int rcount ,
428
- struct ompi_datatype_t * rdtype ,
488
+ struct ompi_datatype_t * dtype ,
429
489
struct ompi_communicator_t * comm ,
430
490
int * topo )
431
491
{
432
492
int i , topolevel = 2 ; // always 2 levels in topo
433
493
int w_rank = ompi_comm_rank (comm );
434
494
int w_size = ompi_comm_size (comm );
435
495
ptrdiff_t rextent ;
436
- ompi_datatype_type_extent (rdtype , & rextent );
496
+ ompi_datatype_type_extent (dtype , & rextent );
437
497
for ( i = 0 ; i < w_size ; i ++ ) {
438
498
OPAL_OUTPUT_VERBOSE ((30 , mca_coll_han_component .han_output ,
439
499
"[%d]: Future reorder from %d to %d\n" ,
@@ -443,7 +503,7 @@ ompi_coll_han_reorder_gather(const void *sbuf,
443
503
ptrdiff_t block_size = rextent * (ptrdiff_t )rcount ;
444
504
ptrdiff_t src_shift = block_size * i ;
445
505
ptrdiff_t dest_shift = block_size * (ptrdiff_t )topo [i * topolevel + 1 ];
446
- ompi_datatype_copy_content_same_ddt (rdtype ,
506
+ ompi_datatype_copy_content_same_ddt (dtype ,
447
507
(ptrdiff_t )rcount ,
448
508
(char * )rbuf + dest_shift ,
449
509
(char * )sbuf + src_shift );
0 commit comments