Skip to content

Commit e42bfc1

Browse files
authored
Merge branch 'main' into bugfix/opal-ucx-variable-segfault
2 parents dfdc992 + f6674c0 commit e42bfc1

File tree

5 files changed

+31
-41
lines changed

5 files changed

+31
-41
lines changed

ompi/mca/mtl/ofi/mtl_ofi.c

Lines changed: 14 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -227,7 +227,6 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
227227
size_t namelen;
228228
int count = 0;
229229
char *ep_name = NULL;
230-
char *ep_names = NULL;
231230
fi_addr_t *fi_addrs = NULL;
232231
mca_mtl_ofi_endpoint_t *endpoint = NULL;
233232
int num_peers_limit = (1 << ompi_mtl_ofi.num_bits_source_rank) - 1;
@@ -246,15 +245,6 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
246245
goto bail;
247246
}
248247

249-
/**
250-
* Create array of EP names.
251-
*/
252-
ep_names = malloc(nprocs * namelen);
253-
if (NULL == ep_names) {
254-
ret = OMPI_ERROR;
255-
goto bail;
256-
}
257-
258248
/**
259249
* Create array of fi_addrs.
260250
*/
@@ -264,10 +254,10 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
264254
goto bail;
265255
}
266256

267-
/**
268-
* Retrieve the processes' EP names from modex.
269-
*/
270257
for (i = 0; i < nprocs; ++i) {
258+
/**
259+
* Retrieve the processes' EP name from modex.
260+
*/
271261
OFI_COMPAT_MODEX_RECV(ret,
272262
&mca_mtl_ofi_component.super.mtl_version,
273263
procs[i],
@@ -281,19 +271,18 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
281271
free(errhost);
282272
goto bail;
283273
}
284-
memcpy(&ep_names[i*namelen], ep_name, namelen);
285-
}
286274

287-
/**
288-
* Map the EP names to fi_addrs.
289-
*/
290-
count = fi_av_insert(ompi_mtl_ofi.av, ep_names, nprocs, fi_addrs, 0, NULL);
291-
if ((count < 0) || (nprocs != (size_t)count)) {
292-
opal_output_verbose(1, opal_common_ofi.output,
293-
"%s:%d: fi_av_insert failed: %d\n",
294-
__FILE__, __LINE__, count);
295-
ret = OMPI_ERROR;
296-
goto bail;
275+
/**
276+
* Map the EP name to fi_addr.
277+
*/
278+
count = fi_av_insert(ompi_mtl_ofi.av, ep_name, 1, &fi_addrs[i], 0, NULL);
279+
if ((count < 0) || (1 != (size_t)count)) {
280+
opal_output_verbose(1, opal_common_ofi.output,
281+
"%s:%d: fi_av_insert failed for address %s: %d\n",
282+
__FILE__, __LINE__, ep_name, count);
283+
ret = OMPI_ERROR;
284+
goto bail;
285+
}
297286
}
298287

299288
/**
@@ -326,9 +315,6 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
326315
if (fi_addrs)
327316
free(fi_addrs);
328317

329-
if (ep_names)
330-
free(ep_names);
331-
332318
return ret;
333319
}
334320

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,6 @@ int ompi_mtl_ofi_progress_no_inline(void);
7474

7575
#if OPAL_HAVE_THREAD_LOCAL
7676
extern opal_thread_local int ompi_mtl_ofi_per_thread_ctx;
77-
extern opal_thread_local struct fi_cq_tagged_entry ompi_mtl_ofi_wc[MTL_OFI_MAX_PROG_EVENT_COUNT];
7877
#endif
7978

8079
#define MCA_MTL_OFI_CID_NOT_EXCHANGED 2
@@ -136,9 +135,7 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
136135
ompi_mtl_ofi_request_t *ofi_req = NULL;
137136
struct fi_cq_err_entry error = { 0 };
138137
ssize_t ret;
139-
#if !OPAL_HAVE_THREAD_LOCAL
140138
struct fi_cq_tagged_entry ompi_mtl_ofi_wc[MTL_OFI_MAX_PROG_EVENT_COUNT];
141-
#endif
142139

143140
/**
144141
* Read the work completions from the CQ.

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,6 @@ static int ofi_tag_mode;
3939

4040
#if OPAL_HAVE_THREAD_LOCAL
4141
opal_thread_local int ompi_mtl_ofi_per_thread_ctx;
42-
opal_thread_local struct fi_cq_tagged_entry ompi_mtl_ofi_wc[MTL_OFI_MAX_PROG_EVENT_COUNT];
4342
#endif
4443

4544
/*

opal/mca/btl/ofi/btl_ofi_atomics.c

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ int mca_btl_ofi_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endp
6363
mca_btl_ofi_rdma_completion_t *comp = NULL;
6464
mca_btl_ofi_context_t *ofi_context;
6565

66+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
6667
ofi_context = get_ofi_context(ofi_btl);
6768

6869
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
@@ -87,16 +88,16 @@ int mca_btl_ofi_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endp
8788
fi_datatype, fi_op, &comp->comp_ctx);
8889

8990
if (rc == -FI_EAGAIN) {
91+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
9092
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
9193
return OPAL_ERR_OUT_OF_RESOURCE;
9294
} else if (rc < 0) {
95+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
9396
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
9497
BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
9598
MCA_BTL_OFI_ABORT();
9699
}
97100

98-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
99-
100101
return OPAL_SUCCESS;
101102
}
102103

@@ -114,6 +115,7 @@ int mca_btl_ofi_aop(struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *
114115
mca_btl_ofi_rdma_completion_t *comp = NULL;
115116
mca_btl_ofi_context_t *ofi_context;
116117

118+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
117119
ofi_context = get_ofi_context(ofi_btl);
118120

119121
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
@@ -136,16 +138,16 @@ int mca_btl_ofi_aop(struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *
136138
fi_datatype, fi_op, &comp->comp_ctx);
137139

138140
if (rc == -FI_EAGAIN) {
141+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
139142
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
140143
return OPAL_ERR_OUT_OF_RESOURCE;
141144
} else if (rc < 0) {
145+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
142146
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
143147
BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
144148
MCA_BTL_OFI_ABORT();
145149
}
146150

147-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
148-
149151
return OPAL_SUCCESS;
150152
}
151153

@@ -165,6 +167,7 @@ int mca_btl_ofi_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_en
165167
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t *) endpoint;
166168
mca_btl_ofi_context_t *ofi_context;
167169

170+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
168171
ofi_context = get_ofi_context(ofi_btl);
169172

170173
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
@@ -188,15 +191,15 @@ int mca_btl_ofi_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_en
188191
fi_datatype, FI_CSWAP, &comp->comp_ctx);
189192

190193
if (rc == -FI_EAGAIN) {
194+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
191195
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
192196
return OPAL_ERR_OUT_OF_RESOURCE;
193197
} else if (rc < 0) {
198+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
194199
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
195200
BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
196201
MCA_BTL_OFI_ABORT();
197202
}
198203

199-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
200-
201204
return OPAL_SUCCESS;
202205
}

opal/mca/btl/ofi/btl_ofi_rdma.c

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,8 @@ int mca_btl_ofi_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
6666
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t *) endpoint;
6767
mca_btl_ofi_context_t *ofi_context;
6868

69+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
70+
6971
ofi_context = get_ofi_context(ofi_btl);
7072

7173
/* create completion context */
@@ -84,17 +86,18 @@ int mca_btl_ofi_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
8486
&comp->comp_ctx); /* completion context */
8587

8688
if (-FI_EAGAIN == rc) {
89+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
8790
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
8891
return OPAL_ERR_OUT_OF_RESOURCE;
8992
}
9093

9194
if (0 != rc) {
95+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
9296
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
9397
BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
9498
MCA_BTL_OFI_ABORT();
9599
}
96100

97-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
98101

99102
return OPAL_SUCCESS;
100103
}
@@ -111,6 +114,8 @@ int mca_btl_ofi_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
111114
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t *) endpoint;
112115
mca_btl_ofi_context_t *ofi_context;
113116

117+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
118+
114119
ofi_context = get_ofi_context(ofi_btl);
115120

116121
/* create completion context */
@@ -127,18 +132,18 @@ int mca_btl_ofi_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
127132
&comp->comp_ctx); /* completion context */
128133

129134
if (-FI_EAGAIN == rc) {
135+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
130136
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
131137
return OPAL_ERR_OUT_OF_RESOURCE;
132138
}
133139

134140
if (0 != rc) {
141+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
135142
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
136143
BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
137144
MCA_BTL_OFI_ABORT();
138145
}
139146

140-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
141-
142147
return OPAL_SUCCESS;
143148
}
144149

0 commit comments

Comments
 (0)