Skip to content

Commit 9d98905

Browse files
authored
Merge branch 'main' into squelch_allreduce_log
2 parents 6641472 + 6e15058 commit 9d98905

File tree

6 files changed

+188
-26
lines changed

6 files changed

+188
-26
lines changed

docs/installing-open-mpi/required-support-libraries.rst

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,16 @@ system.
2323

2424
* This library is required; Open MPI will not build without it.
2525
* **Minimum version required:** |hwloc_min_version|
26+
27+
.. note:: While the minimum version of Hwloc will *work*, it may
28+
still have bugs and/or have less functionality as
29+
compared to later versions.
30+
31+
Other than the Hwloc restriction about v3.0.0 and beyond
32+
(see below), the Open MPI community generally recommends
33+
using the latest available version of Hwloc unless there
34+
is a specific reason not to.
35+
2636
* **Version embedded in Open MPI distribution:**
2737
|hwloc_embedded_version|
2838

@@ -62,11 +72,32 @@ system.
6272
* **Version embedded in Open MPI distribution:**
6373
|event_embedded_version|
6474

65-
* `PMIx <https://pmix.org/>`_
75+
.. note:: The Open MPI community has heavily tested Libevent
76+
|event_embedded_version|. Other versions should *work*,
77+
but |mdash| unlike Hwloc, OpenPMIx, and PRRTE |mdash|
78+
there is not much reason to upgrade to use a later
79+
version of Libevent.
80+
81+
* `OpenPMIx <https://docs.openpmix.org/>`_
6682

6783
* This library is required; Open MPI will not build without it.
6884
* **Minimum version required when building without PRRTE:**
6985
|pmix_min_version|
86+
87+
.. note:: While the minimum version of OpenPMIx will *work*, it
88+
may still have bugs and/or have less functionality as
89+
compared to later versions.
90+
91+
The Open MPI community generally recommends using the
92+
latest available version of OpenPMIx unless there is a
93+
specific reason not to.
94+
95+
.. note:: While `OpenPMIx <https://docs.openpmix.org/>`_ is the
96+
formal name of the software that implements the `PMIx
97+
<https://pmix.org/>`_ standard, the term "PMIx" is used
98+
extensively throughout this documentation to refer to
99+
the OpenPMIx software package.
100+
70101
* **Minimum version required when building with PRRTE:** `See the
71102
PRRTE project documentation <https://docs.prrte.org/>`_.
72103
* **Version embedded in Open MPI distribution:**
@@ -77,10 +108,15 @@ system.
77108
* This library is optional in some environments. See below.
78109
* **Minimum version required:** |prte_min_version|
79110

80-
.. note:: While building Open MPI with PRRTE |prte_min_version|
81-
*works*, you will not get a fully-populated
82-
``mpirun(1)`` man page. The Open MPI community
83-
recommends that you use PRRTE version 3.0.1 or higher.
111+
.. note:: While the minimum version of PRRTE will *work*, it may
112+
still have bugs and/or have less functionality as
113+
compared to later versions. For example, if you build
114+
and run with |prte_min_version|, you will not get a
115+
fully-populated ``mpirun(1)`` man page.
116+
117+
The Open MPI community generally recommends using the
118+
latest available version of PRRTE unless there is a
119+
specific reason not to.
84120

85121
* **Version embedded in Open MPI distribution:**
86122
|prte_embedded_version|

ompi/mca/fs/lustre/fs_lustre_component.c

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ int mca_fs_lustre_priority = 20;
4747
int mca_fs_lustre_stripe_size = 0;
4848
int mca_fs_lustre_stripe_width = 0;
4949
int mca_fs_lustre_lock_algorithm = 0; /* auto */
50+
51+
static const mca_base_var_enum_value_t ompi_fs_lustre_lock_algorithm_modes[] = {
52+
{.value = 0, .string = "auto"},
53+
{.value = 1, .string = "skip locking"},
54+
{.value = 2, .string = "always lock entire file"},
55+
{.value = 3, .string = "lock specific ranges"},
56+
{.string = NULL},
57+
};
58+
5059
/*
5160
* Instantiate the public struct with all of our public information
5261
* and pointers to our public functions in it
@@ -77,6 +86,8 @@ mca_fs_base_component_2_0_0_t mca_fs_lustre_component = {
7786
static int
7887
lustre_register(void)
7988
{
89+
mca_base_var_enum_t *new_enum;
90+
8091
mca_fs_lustre_priority = 20;
8192
(void) mca_base_component_var_register(&mca_fs_lustre_component.fsm_version,
8293
"priority", "Priority of the lustre fs component",
@@ -95,15 +106,18 @@ lustre_register(void)
95106
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
96107
OPAL_INFO_LVL_9,
97108
MCA_BASE_VAR_SCOPE_READONLY, &mca_fs_lustre_stripe_width);
109+
110+
(void) mca_base_var_enum_create("mca_fs_lustre_lock_algorithm", ompi_fs_lustre_lock_algorithm_modes, &new_enum);
111+
98112
mca_fs_lustre_lock_algorithm = 0;
99113
(void) mca_base_component_var_register(&mca_fs_lustre_component.fsm_version,
100-
"lock_algorithm", "Locking algorithm used by the fs ufs component. "
101-
" 0: auto (default), 1: skip locking, 2: always lock entire file, "
102-
"3: lock only specific ranges",
103-
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
114+
"lock_algorithm", "Locking algorithm used by the fs lustre component. "
115+
"(default: auto)",
116+
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
104117
OPAL_INFO_LVL_9,
105118
MCA_BASE_VAR_SCOPE_READONLY,
106-
&mca_fs_lustre_lock_algorithm );
119+
&mca_fs_lustre_lock_algorithm);
120+
OBJ_RELEASE(new_enum);
107121

108122
return OMPI_SUCCESS;
109123
}

ompi/mca/fs/ufs/fs_ufs_component.c

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,15 @@
3232

3333
int mca_fs_ufs_priority = 10;
3434
int mca_fs_ufs_lock_algorithm=0; /* auto */
35+
36+
static const mca_base_var_enum_value_t ompi_fs_ufs_lock_algorithm_modes[] = {
37+
{.value = 0, .string = "auto"},
38+
{.value = 1, .string = "skip locking"},
39+
{.value = 2, .string = "always lock entire file"},
40+
{.value = 3, .string = "lock specific ranges"},
41+
{.string = NULL},
42+
};
43+
3544
/*
3645
* Private functions
3746
*/
@@ -73,6 +82,8 @@ mca_fs_base_component_2_0_0_t mca_fs_ufs_component = {
7382

7483
static int register_component(void)
7584
{
85+
mca_base_var_enum_t *new_enum;
86+
7687
mca_fs_ufs_priority = 10;
7788
(void) mca_base_component_var_register(&mca_fs_ufs_component.fsm_version,
7889
"priority", "Priority of the fs ufs component",
@@ -81,15 +92,17 @@ static int register_component(void)
8192
MCA_BASE_VAR_SCOPE_READONLY,
8293
&mca_fs_ufs_priority);
8394

95+
(void) mca_base_var_enum_create("mca_fs_ufs_lock_algorithm", ompi_fs_ufs_lock_algorithm_modes, &new_enum);
96+
8497
mca_fs_ufs_lock_algorithm = 0;
8598
(void) mca_base_component_var_register(&mca_fs_ufs_component.fsm_version,
8699
"lock_algorithm", "Locking algorithm used by the fs ufs component. "
87-
" 0: auto (default), 1: skip locking, 2: always lock entire file, "
88-
"3: lock only specific ranges",
89-
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
100+
"(default: auto)",
101+
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
90102
OPAL_INFO_LVL_9,
91103
MCA_BASE_VAR_SCOPE_READONLY,
92-
&mca_fs_ufs_lock_algorithm );
104+
&mca_fs_ufs_lock_algorithm);
105+
OBJ_RELEASE(new_enum);
93106

94107
return OMPI_SUCCESS;
95108
}

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 107 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -217,12 +217,96 @@ static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
217217
return 0;
218218
}
219219

220+
static int accelerator_cuda_check_mpool(CUdeviceptr dbuf, CUmemorytype *mem_type,
221+
int *dev_id)
222+
{
223+
#if OPAL_CUDA_VMM_SUPPORT
224+
static int device_count = -1;
225+
static int mpool_supported = -1;
226+
CUresult result;
227+
CUmemoryPool mpool;
228+
CUmemAccess_flags flags;
229+
CUmemLocation location;
230+
231+
if (mpool_supported <= 0) {
232+
if (mpool_supported == -1) {
233+
if (device_count == -1) {
234+
result = cuDeviceGetCount(&device_count);
235+
if (result != CUDA_SUCCESS || (0 == device_count)) {
236+
mpool_supported = 0; /* never check again */
237+
device_count = 0;
238+
return 0;
239+
}
240+
}
241+
242+
/* assume uniformity of devices */
243+
result = cuDeviceGetAttribute(&mpool_supported,
244+
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, 0);
245+
if (result != CUDA_SUCCESS) {
246+
mpool_supported = 0;
247+
}
248+
}
249+
if (0 == mpool_supported) {
250+
return 0;
251+
}
252+
}
253+
254+
result = cuPointerGetAttribute(&mpool, CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
255+
dbuf);
256+
if (CUDA_SUCCESS != result) {
257+
return 0;
258+
}
259+
260+
/* check if device has access */
261+
for (int i = 0; i < device_count; i++) {
262+
location.type = CU_MEM_LOCATION_TYPE_DEVICE;
263+
location.id = i;
264+
result = cuMemPoolGetAccess(&flags, mpool, &location);
265+
if ((CUDA_SUCCESS == result) &&
266+
(CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
267+
*mem_type = CU_MEMORYTYPE_DEVICE;
268+
*dev_id = i;
269+
return 1;
270+
}
271+
}
272+
273+
/* host must have access as device access possibility is exhausted */
274+
*mem_type = CU_MEMORYTYPE_HOST;
275+
*dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
276+
return 0;
277+
#endif
278+
279+
return 0;
280+
}
281+
282+
static int accelerator_cuda_get_primary_context(CUdevice dev_id, CUcontext *pctx)
283+
{
284+
CUresult result;
285+
unsigned int flags;
286+
int active;
287+
288+
result = cuDevicePrimaryCtxGetState(dev_id, &flags, &active);
289+
if (CUDA_SUCCESS != result) {
290+
return OPAL_ERROR;
291+
}
292+
293+
if (active) {
294+
result = cuDevicePrimaryCtxRetain(pctx, dev_id);
295+
return OPAL_SUCCESS;
296+
}
297+
298+
return OPAL_ERROR;
299+
}
300+
220301
static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
221302
{
222303
CUresult result;
223304
int is_vmm = 0;
305+
int is_mpool_ptr = 0;
224306
int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
307+
int mpool_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
225308
CUmemorytype vmm_mem_type = 0;
309+
CUmemorytype mpool_mem_type = 0;
226310
CUmemorytype mem_type = 0;
227311
CUdeviceptr dbuf = (CUdeviceptr) addr;
228312
CUcontext ctx = NULL, mem_ctx = NULL;
@@ -235,6 +319,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
235319
*flags = 0;
236320

237321
is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
322+
is_mpool_ptr = accelerator_cuda_check_mpool(dbuf, &mpool_mem_type, &mpool_dev_id);
238323

239324
#if OPAL_CUDA_GET_ATTRIBUTES
240325
uint32_t is_managed = 0;
@@ -268,6 +353,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
268353
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
269354
mem_type = CU_MEMORYTYPE_DEVICE;
270355
*dev_id = vmm_dev_id;
356+
} else if (is_mpool_ptr && (mpool_mem_type == CU_MEMORYTYPE_DEVICE)) {
357+
mem_type = CU_MEMORYTYPE_DEVICE;
358+
*dev_id = mpool_dev_id;
271359
} else {
272360
/* Host memory, nothing to do here */
273361
return 0;
@@ -278,6 +366,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
278366
} else {
279367
if (is_vmm) {
280368
*dev_id = vmm_dev_id;
369+
} else if (is_mpool_ptr) {
370+
*dev_id = mpool_dev_id;
281371
} else {
282372
/* query the device from the context */
283373
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
@@ -296,13 +386,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
296386
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
297387
mem_type = CU_MEMORYTYPE_DEVICE;
298388
*dev_id = vmm_dev_id;
389+
} else if (is_mpool_ptr && (mpool_mem_type == CU_MEMORYTYPE_DEVICE)) {
390+
mem_type = CU_MEMORYTYPE_DEVICE;
391+
*dev_id = mpool_dev_id;
299392
} else {
300393
/* Host memory, nothing to do here */
301394
return 0;
302395
}
303396
} else {
304397
if (is_vmm) {
305398
*dev_id = vmm_dev_id;
399+
} else if (is_mpool_ptr) {
400+
*dev_id = mpool_dev_id;
306401
} else {
307402
result = cuPointerGetAttribute(&mem_ctx,
308403
CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
@@ -336,14 +431,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
336431
return OPAL_ERROR;
337432
}
338433
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
339-
if (is_vmm) {
340-
/* This function is expected to set context if pointer is device
341-
* accessible but VMM allocations have NULL context associated
342-
* which cannot be set against the calling thread */
343-
opal_output(0,
344-
"CUDA: unable to set context with the given pointer"
345-
"ptr=%p aborting...", addr);
346-
return OPAL_ERROR;
434+
if (is_vmm || is_mpool_ptr) {
435+
if (OPAL_SUCCESS ==
436+
accelerator_cuda_get_primary_context(
437+
is_vmm ? vmm_dev_id : mpool_dev_id, &mem_ctx)) {
438+
/* As VMM/mempool allocations have no context associated
439+
* with them, check if device primary context can be set */
440+
} else {
441+
opal_output(0,
442+
"CUDA: unable to set ctx with the given pointer"
443+
"ptr=%p aborting...", addr);
444+
return OPAL_ERROR;
445+
}
347446
}
348447

349448
result = cuCtxSetCurrent(mem_ctx);

opal/mca/btl/uct/btl_uct_component.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
1818
* Copyright (c) 2018-2024 Triad National Security, LLC. All rights
1919
* reserved.
20-
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
20+
* Copyright (c) 2019-2024 Google, LLC. All rights reserved.
2121
* Copyright (c) 2019 Intel, Inc. All rights reserved.
2222
* Copyright (c) 2022 IBM Corporation. All rights reserved.
2323
* $COPYRIGHT$
@@ -48,13 +48,13 @@ static int mca_btl_uct_component_register(void)
4848
{
4949
mca_btl_uct_module_t *module = &mca_btl_uct_module_template;
5050

51-
mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0";
51+
mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4";
5252
(void) mca_base_component_var_register(
5353
&mca_btl_uct_component.super.btl_version, "memory_domains",
5454
"Comma-delimited list of memory domains of the form "
5555
"to use for communication. Memory domains MUST provide transports that "
5656
"support put, get, and amos. Special values: all (all available), none."
57-
" (default: mlx5_0,mlx4_0)",
57+
" (default: mlx5_0,mlx4_0,rocep0s4)",
5858
MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
5959
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.memory_domains);
6060

0 commit comments

Comments
 (0)