Skip to content

Commit 7c72657

Browse files
jsquyresrhc54
authored andcommitted
pmix_fd: cap the max FD to try to close
On some OS's (e.g., macOS), the value returned by sysconf(_SC_OPEN_MAX) can be set by the user via "ulimit -n X", where X can be -1 (unlimited) or a positive integer. On macOS in particular, if the user does not set this value, it's unclear how the default value is chosen. Some users have reported seeing arbitrarily large default values (in the billions), resulting in a very large loop over close() that can take minutes/hours to complete, leading the user to think that the app has hung. To avoid this, ensure that we cap the max FD that we'll try to close. This is not a perfect scheme, and there's uncertainty on how the macOS default value works, so we provide the pmix_maxfd MCA var to allow the user to set the max FD value if needed. This commit is inspired by #10360 and #10358. Thanks to Scott Sayres for raising the issue. Signed-off-by: Jeff Squyres <[email protected]>
1 parent 594303d commit 7c72657

File tree

3 files changed

+27
-3
lines changed

3 files changed

+27
-3
lines changed

src/runtime/pmix_params.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
15+
* Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved.
1616
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
1717
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
1818
* All rights reserved.
@@ -51,6 +51,7 @@ int pmix_event_caching_window = 1;
5151
bool pmix_suppress_missing_data_warning = false;
5252
char *pmix_progress_thread_cpus = NULL;
5353
bool pmix_bind_progress_thread_reqd = false;
54+
int pmix_maxfd = 1024;
5455

5556
pmix_status_t pmix_register_params(void)
5657
{
@@ -269,6 +270,11 @@ pmix_status_t pmix_register_params(void)
269270
PMIX_MCA_BASE_VAR_TYPE_BOOL,
270271
&pmix_bind_progress_thread_reqd);
271272

273+
(void) pmix_mca_base_var_register("pmix", "pmix", NULL, "maxfd",
274+
"In non-Linux environments, use this value as a maximum number of file descriptors to close when forking a new child process",
275+
PMIX_MCA_BASE_VAR_TYPE_INT,
276+
&pmix_maxfd);
277+
272278
pmix_hwloc_register();
273279
return PMIX_SUCCESS;
274280
}

src/runtime/pmix_rte.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
13-
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2010-2022 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2021-2022 Nanook Consulting All rights reserved.
1616
* $COPYRIGHT$
@@ -51,6 +51,7 @@ PMIX_EXPORT extern int pmix_event_caching_window;
5151
PMIX_EXPORT extern bool pmix_suppress_missing_data_warning;
5252
PMIX_EXPORT extern char *pmix_progress_thread_cpus;
5353
PMIX_EXPORT extern bool pmix_bind_progress_thread_reqd;
54+
PMIX_EXPORT extern int pmix_maxfd;
5455

5556
/** version string of pmix */
5657
extern const char pmix_version_string[];

src/util/pmix_fd.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved.
33
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
44
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
55
* Copyright (c) 2015 Research Organization for Information Science
@@ -52,6 +52,7 @@
5252
#include "src/util/pmix_error.h"
5353
#include "src/util/pmix_fd.h"
5454
#include "src/util/pmix_string_copy.h"
55+
#include "src/runtime/pmix_rte.h"
5556

5657
/*
5758
* Simple loop over reading from a fd
@@ -236,6 +237,22 @@ void pmix_close_open_file_descriptors(int protected_fd)
236237
if (0 > fdmax) {
237238
fdmax = sysconf(_SC_OPEN_MAX);
238239
}
240+
// On some OS's (e.g., macOS), the value returned by
241+
// sysconf(_SC_OPEN_MAX) can be set by the user via "ulimit -n X",
242+
// where X can be -1 (unlimited) or a positive integer. On macOS
243+
// in particular, if the user does not set this value, it's
244+
// unclear how the default value is chosen. Some users have
245+
// reported seeing arbitrarily large default values (in the
246+
// billions), resulting in a very large loop over close() that can
247+
// take minutes/hours to complete, leading the user to think that
248+
// the app has hung. To avoid this, ensure that we cap the max FD
249+
// that we'll try to close. This is not a perfect scheme, and
250+
// there's uncertainty on how the macOS default value works, so we
251+
// provide the pmix_maxfd MCA var to allow the user to set the max
252+
// FD value if needed.
253+
if (-1 == fdmax || pmix_maxfd < fdmax) {
254+
fdmax = pmix_maxfd;
255+
}
239256
for (int fd = 3; fd < fdmax; fd++) {
240257
if (fd != protected_fd) {
241258
close(fd);

0 commit comments

Comments
 (0)