Skip to content

Commit 4537ce9

Browse files
Merge branch 'v5.0.x' into topic/5.0.x/handle-masync-assign-ctx
2 parents c09e947 + e25d3e6 commit 4537ce9

File tree

13 files changed

+127
-56
lines changed

13 files changed

+127
-56
lines changed

docs/tuning-apps/networking/rocm.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ using Open MPI and UCX ROCm support is something like this:
7979
.. code-block::
8080
8181
shell$ mpirun -n 2 --mca pml ucx \
82-
./osu_latency -d rocm D D
82+
./osu_latency D D
8383
8484
Note: some additional configure flags are required to compile the OSU
8585
benchmark to support ROCm buffers. Please refer to the `UCX ROCm

ompi/mca/coll/cuda/coll_cuda.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
5454
struct ompi_communicator_t *comm,
5555
mca_coll_base_module_t *module);
5656

57+
int mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, size_t count,
58+
struct ompi_datatype_t *dtype,
59+
struct ompi_op_t *op,
60+
mca_coll_base_module_t *module);
61+
5762
int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
5863
struct ompi_datatype_t *dtype,
5964
struct ompi_op_t *op,

ompi/mca/coll/cuda/coll_cuda_module.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ static void mca_coll_cuda_module_destruct(mca_coll_cuda_module_t *module)
4040
{
4141
OBJ_RELEASE(module->c_coll.coll_allreduce_module);
4242
OBJ_RELEASE(module->c_coll.coll_reduce_module);
43+
OBJ_RELEASE(module->c_coll.coll_reduce_local_module);
4344
OBJ_RELEASE(module->c_coll.coll_reduce_scatter_block_module);
4445
OBJ_RELEASE(module->c_coll.coll_scatter_module);
4546
/* If the exscan module is not NULL, then this was an
@@ -103,6 +104,7 @@ mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
103104
cuda_module->super.coll_gather = NULL;
104105
cuda_module->super.coll_gatherv = NULL;
105106
cuda_module->super.coll_reduce = mca_coll_cuda_reduce;
107+
cuda_module->super.coll_reduce_local = mca_coll_cuda_reduce_local;
106108
cuda_module->super.coll_reduce_scatter = NULL;
107109
cuda_module->super.coll_reduce_scatter_block = mca_coll_cuda_reduce_scatter_block;
108110
cuda_module->super.coll_scan = mca_coll_cuda_scan;
@@ -135,6 +137,7 @@ int mca_coll_cuda_module_enable(mca_coll_base_module_t *module,
135137

136138
CHECK_AND_RETAIN(comm, s, allreduce);
137139
CHECK_AND_RETAIN(comm, s, reduce);
140+
CHECK_AND_RETAIN(comm, s, reduce_local);
138141
CHECK_AND_RETAIN(comm, s, reduce_scatter_block);
139142
CHECK_AND_RETAIN(comm, s, scatter);
140143
if (!OMPI_COMM_IS_INTER(comm)) {

ompi/mca/coll/cuda/coll_cuda_reduce.c

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,60 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
8383
}
8484
return rc;
8585
}
86+
87+
int
88+
mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, size_t count,
89+
struct ompi_datatype_t *dtype,
90+
struct ompi_op_t *op,
91+
mca_coll_base_module_t *module)
92+
{
93+
ptrdiff_t gap;
94+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
95+
size_t bufsize;
96+
int rc;
97+
98+
bufsize = opal_datatype_span(&dtype->super, count, &gap);
99+
100+
rc = mca_coll_cuda_check_buf((void *)sbuf);
101+
if (rc < 0) {
102+
return rc;
103+
}
104+
105+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
106+
sbuf1 = (char*)malloc(bufsize);
107+
if (NULL == sbuf1) {
108+
return OMPI_ERR_OUT_OF_RESOURCE;
109+
}
110+
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
111+
sbuf = sbuf1 - gap;
112+
}
113+
114+
rc = mca_coll_cuda_check_buf(rbuf);
115+
if (rc < 0) {
116+
return rc;
117+
}
118+
119+
if (rc > 0) {
120+
rbuf1 = (char*)malloc(bufsize);
121+
if (NULL == rbuf1) {
122+
if (NULL != sbuf1) free(sbuf1);
123+
return OMPI_ERR_OUT_OF_RESOURCE;
124+
}
125+
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
126+
rbuf2 = rbuf; /* save away original buffer */
127+
rbuf = rbuf1 - gap;
128+
}
129+
130+
ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
131+
rc = OMPI_SUCCESS;
132+
133+
if (NULL != sbuf1) {
134+
free(sbuf1);
135+
}
136+
if (NULL != rbuf1) {
137+
rbuf = rbuf2;
138+
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
139+
free(rbuf1);
140+
}
141+
return rc;
142+
}

ompi/mca/pml/ob1/pml_ob1_isend.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ static inline int mca_pml_ob1_send_inline (const void *buf, size_t count,
143143
}
144144

145145
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
146-
return rc;
146+
return rc;
147147
}
148148

149149
return (int) size;

ompi/mca/pml/ob1/pml_ob1_recvreq.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr
382382
}
383383
}
384384

385-
if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
385+
if (frag->retries < mca_pml_ob1.rdma_retries_limit &&
386386
OMPI_ERR_OUT_OF_RESOURCE == rc) {
387387
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
388388
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
@@ -413,6 +413,7 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_
413413
/* check completion status */
414414
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
415415
status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
416+
/* fragment was returned or queued by the above call */
416417
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
417418
size_t skipped_bytes = recvreq->req_send_offset - recvreq->req_rdma_offset;
418419
opal_output_verbose(mca_pml_ob1_output, 1, "pml:ob1: %s: operation failed with code %d", __func__, status);
@@ -435,12 +436,12 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_
435436
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
436437
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
437438
frag->rdma_length, 0, 0);
439+
440+
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
438441
}
439442

440443
recv_request_pml_complete_check(recvreq);
441444

442-
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
443-
444445
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
445446
}
446447

ompi/mca/pml/ob1/pml_ob1_sendreq.c

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
* Copyright (c) 2018-2019 Triad National Security, LLC. All rights
2323
* reserved.
2424
* Copyright (c) 2022 IBM Corporation. All rights reserved.
25+
* Copyright (c) 2024 Google, LLC. All rights reserved.
2526
* $COPYRIGHT$
2627
*
2728
* Additional copyrights may follow
@@ -1110,6 +1111,12 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
11101111

11111112
range = get_send_range(sendreq);
11121113

1114+
if (NULL != sendreq->rdma_frag) {
1115+
/* this request was first attempted with RDMA but is now using send/recv */
1116+
MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
1117+
sendreq->rdma_frag = NULL;
1118+
}
1119+
11131120
while(range && (false == sendreq->req_throttle_sends ||
11141121
sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
11151122
mca_pml_ob1_frag_hdr_t* hdr;
@@ -1268,30 +1275,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f
12681275
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
12691276
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
12701277

1271-
if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
1278+
if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
12721279
/* queue the frag for later if there was a resource error */
12731280
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
12741281
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
12751282
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
1276-
} else {
1283+
return;
1284+
}
1285+
12771286
#if OPAL_ENABLE_FT
1278-
if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
1279-
return;
1280-
}
1281-
#endif /* OPAL_ENABLE_FT */
1282-
/* tell receiver to deregister memory */
1283-
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
1284-
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
1285-
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
1286-
1287-
/* send fragment by copy in/out */
1288-
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
1289-
frag->rdma_length);
1290-
/* if a pointer to a receive request is not set it means that
1291-
* ACK was not yet received. Don't schedule sends before ACK */
1292-
if (NULL != sendreq->req_recv.pval)
1293-
mca_pml_ob1_send_request_schedule (sendreq);
1287+
if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
1288+
return;
12941289
}
1290+
#endif /* OPAL_ENABLE_FT */
1291+
/* tell receiver to deregister memory */
1292+
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
1293+
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
1294+
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
1295+
1296+
/* send fragment by copy in/out */
1297+
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
1298+
frag->rdma_length);
1299+
/* if a pointer to a receive request is not set it means that
1300+
* ACK was not yet received. Don't schedule sends before ACK */
1301+
if (NULL != sendreq->req_recv.pval)
1302+
mca_pml_ob1_send_request_schedule (sendreq);
12951303
}
12961304

12971305
/**

opal/datatype/opal_datatype_internal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ struct opal_datatype_t;
539539
# define OPAL_DATATYPE_SAFEGUARD_POINTER(ACTPTR, LENGTH, INITPTR, PDATA, COUNT) \
540540
{ \
541541
unsigned char *__lower_bound = (INITPTR), *__upper_bound; \
542-
assert(((LENGTH) != 0) && ((COUNT) != 0)); \
542+
assert( (COUNT) != 0 ); \
543543
__lower_bound += (PDATA)->true_lb; \
544544
__upper_bound = (INITPTR) + (PDATA)->true_ub + \
545545
((PDATA)->ub - (PDATA)->lb) * ((COUNT) -1); \

opal/datatype/opal_datatype_position.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ static inline void position_single_block(opal_convertor_t *CONVERTOR, unsigned c
6666
}
6767

6868
/**
69-
* Advance the convertors' position according. Update the pointer and the remaining space
70-
* accordingly.
69+
* Advance the convertor's position to account for *COUNT elements. Update
70+
* the pointer and the remaining space accordingly.
7171
*/
7272
static inline void position_predefined_data(opal_convertor_t *CONVERTOR, dt_elem_desc_t *ELEM,
7373
size_t *COUNT, unsigned char **POINTER, size_t *SPACE)
@@ -82,7 +82,8 @@ static inline void position_predefined_data(opal_convertor_t *CONVERTOR, dt_elem
8282

8383
if (cando_count > *(COUNT)) {
8484
cando_count = *(COUNT);
85-
}
85+
} else if( 0 == cando_count )
86+
return;
8687

8788
if (1 == _elem->blocklen) {
8889
DO_DEBUG(opal_output(0,

opal/mca/btl/sm/btl_sm_send.c

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -73,18 +73,4 @@ int mca_btl_sm_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpo
7373
}
7474

7575
return OPAL_SUCCESS;
76-
77-
#if 0
78-
if (((frag->hdr->flags & MCA_BTL_SM_FLAG_SINGLE_COPY) ||
79-
!(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) &&
80-
frag->base.des_cbfunc) {
81-
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
82-
83-
return OPAL_SUCCESS;
84-
}
85-
86-
/* data is gone (from the pml's perspective). frag callback/release will
87-
happen later */
88-
return 1;
89-
#endif
9076
}

0 commit comments

Comments
 (0)