Skip to content

Commit 8f496b0

Browse files
author
Ralph Castain
committed
Try automatically adding local spawn threads to parallelize the fork/exec process to speed up the launch on large SMPs. Harvest the threads after initial spawn to minimize any impact on running jobs.
Change the determination of #spawn threads to be done on basis of #local procs in first job being spawned. Someone can look at an optimization that handles subsequent dynamic spawns that might be larger in size. Leave the threads running, but blocked, for the life of the daemon, and use them to harvest the local procs as they terminate. This helps short-lived jobs in particular. Add MCA params to set: * max number of spawn threads (default: 4) * set a specific number of spawn threads (default: -1, indicating no set number) * cutoff - minimum number of local procs before using spawn threads (default: 32) Signed-off-by: Ralph Castain <[email protected]>
1 parent 45db363 commit 8f496b0

File tree

12 files changed

+248
-126
lines changed

12 files changed

+248
-126
lines changed

orte/mca/errmgr/default_orted/errmgr_default_orted.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "orte/mca/ess/ess.h"
4747
#include "orte/mca/state/state.h"
4848

49+
#include "orte/runtime/orte_wait.h"
4950
#include "orte/runtime/orte_quit.h"
5051
#include "orte/runtime/orte_globals.h"
5152
#include "orte/runtime/data_type_support/orte_dt_support.h"
@@ -327,6 +328,7 @@ static void proc_errors(int fd, short args, void *cbdata)
327328
orte_plm_cmd_flag_t cmd;
328329
int rc=ORTE_SUCCESS;
329330
int i;
331+
orte_wait_tracker_t *t2;
330332

331333
ORTE_ACQUIRE_OBJECT(caddy);
332334

@@ -412,7 +414,14 @@ static void proc_errors(int fd, short args, void *cbdata)
412414
goto cleanup;
413415
}
414416
/* leave the exit code alone - process this as a waitpid */
415-
ompi_odls_base_default_wait_local_proc(child, NULL);
417+
t2 = OBJ_NEW(orte_wait_tracker_t);
418+
OBJ_RETAIN(child); // protect against race conditions
419+
t2->child = child;
420+
t2->evb = orte_event_base;
421+
opal_event_set(t2->evb, &t2->ev, -1,
422+
OPAL_EV_WRITE, orte_odls_base_default_wait_local_proc, t2);
423+
opal_event_set_priority(&t2->ev, ORTE_MSG_PRI);
424+
opal_event_active(&t2->ev, OPAL_EV_WRITE, 1);
416425
goto cleanup;
417426
}
418427
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,

orte/mca/odls/base/base.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
14+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -44,5 +45,9 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_odls_base_framework;
4445
*/
4546
ORTE_DECLSPEC int orte_odls_base_select(void);
4647

48+
ORTE_DECLSPEC void orte_odls_base_start_threads(orte_job_t *jdata);
49+
50+
ORTE_DECLSPEC void orte_odls_base_harvest_threads(void);
51+
4752
END_C_DECLS
4853
#endif

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 67 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* All rights reserved.
1616
* Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
1717
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18-
* Copyright (c) 2014 Research Organization for Information Science
18+
* Copyright (c) 2014-2017 Research Organization for Information Science
1919
* and Technology (RIST). All rights reserved.
2020
* Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
2121
* Copyright (c) 2017 IBM Corporation. All rights reserved.
@@ -614,6 +614,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
614614
goto REPORT_ERROR;
615615
}
616616

617+
/* spin up the spawn threads */
618+
orte_odls_base_start_threads(jdata);
619+
617620
/* to save memory, purge the job map of all procs other than
618621
* our own - for daemons, this will completely release the
619622
* proc structures. For the HNP, the proc structs will
@@ -727,9 +730,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
727730
int rc, i;
728731
bool found;
729732
orte_proc_state_t state;
730-
char **argvptr;
731-
char *pathenv = NULL, *mpiexec_pathenv = NULL;
732-
char *full_search;
733733

734734
ORTE_ACQUIRE_OBJECT(cd);
735735

@@ -772,44 +772,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
772772
goto errorout;
773773
}
774774

775-
/* Search for the OMPI_exec_path and PATH settings in the environment. */
776-
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
777-
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
778-
mpiexec_pathenv = *argvptr + 15;
779-
}
780-
if (0 == strncmp("PATH=", *argvptr, 5)) {
781-
pathenv = *argvptr + 5;
782-
}
783-
}
784-
785-
/* If OMPI_exec_path is set (meaning --path was used), then create a
786-
temporary environment to be used in the search for the executable.
787-
The PATH setting in this temporary environment is a combination of
788-
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
789-
then just use existing environment with PATH in it. */
790-
if (NULL != mpiexec_pathenv) {
791-
argvptr = NULL;
792-
if (pathenv != NULL) {
793-
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
794-
} else {
795-
asprintf(&full_search, "%s", mpiexec_pathenv);
796-
}
797-
opal_setenv("PATH", full_search, true, &argvptr);
798-
free(full_search);
799-
} else {
800-
argvptr = app->env;
801-
}
802-
803-
rc = orte_util_check_context_app(app, argvptr);
804-
/* do not ERROR_LOG - it will be reported elsewhere */
805-
if (NULL != mpiexec_pathenv) {
806-
opal_argv_free(argvptr);
807-
}
808-
if (ORTE_SUCCESS != rc) {
809-
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
810-
goto errorout;
811-
}
812-
813775
/* did the user request we display output in xterms? */
814776
if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
815777
opal_list_item_t *nmitem;
@@ -878,15 +840,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
878840
cd->argv[0] = param;
879841
}
880842

881-
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
882-
opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
883-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
884-
ORTE_NAME_PRINT(&child->name));
843+
opal_output_verbose(5, orte_odls_base_framework.framework_output,
844+
"%s odls:launch spawning child %s",
845+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
846+
ORTE_NAME_PRINT(&child->name));
885847

848+
if (15 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
886849
/* dump what is going to be exec'd */
887-
if (7 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
888-
opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
889-
}
850+
opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
890851
}
891852

892853
if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
@@ -923,6 +884,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
923884
orte_odls_spawn_caddy_t *cd;
924885
opal_event_base_t *evb;
925886
char *effective_dir = NULL;
887+
char **argvptr;
888+
char *pathenv = NULL, *mpiexec_pathenv = NULL;
889+
char *full_search;
926890

927891
ORTE_ACQUIRE_OBJECT(caddy);
928892

@@ -1105,6 +1069,44 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
11051069
goto GETOUT;
11061070
}
11071071

1072+
/* Search for the OMPI_exec_path and PATH settings in the environment. */
1073+
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
1074+
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
1075+
mpiexec_pathenv = *argvptr + 15;
1076+
}
1077+
if (0 == strncmp("PATH=", *argvptr, 5)) {
1078+
pathenv = *argvptr + 5;
1079+
}
1080+
}
1081+
1082+
/* If OMPI_exec_path is set (meaning --path was used), then create a
1083+
temporary environment to be used in the search for the executable.
1084+
The PATH setting in this temporary environment is a combination of
1085+
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
1086+
then just use existing environment with PATH in it. */
1087+
if (NULL != mpiexec_pathenv) {
1088+
argvptr = NULL;
1089+
if (pathenv != NULL) {
1090+
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
1091+
} else {
1092+
asprintf(&full_search, "%s", mpiexec_pathenv);
1093+
}
1094+
opal_setenv("PATH", full_search, true, &argvptr);
1095+
free(full_search);
1096+
} else {
1097+
argvptr = app->env;
1098+
}
1099+
1100+
rc = orte_util_check_context_app(app, argvptr);
1101+
/* do not ERROR_LOG - it will be reported elsewhere */
1102+
if (NULL != mpiexec_pathenv) {
1103+
opal_argv_free(argvptr);
1104+
}
1105+
if (ORTE_SUCCESS != rc) {
1106+
goto GETOUT;
1107+
}
1108+
1109+
11081110
/* tell all children that they are being launched via ORTE */
11091111
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
11101112

@@ -1186,10 +1188,17 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
11861188
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
11871189
ORTE_NAME_PRINT(&child->name)));
11881190

1191+
/* determine the thread that will handle this child */
1192+
++orte_odls_globals.next_base;
1193+
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1194+
orte_odls_globals.next_base = 0;
1195+
}
1196+
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1197+
11891198
/* set the waitpid callback here for thread protection and
11901199
* to ensure we can capture the callback on shortlived apps */
11911200
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
1192-
orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);
1201+
orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
11931202

11941203
/* dispatch this child to the next available launch thread */
11951204
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
@@ -1228,16 +1237,11 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12281237
goto GETOUT;
12291238
}
12301239
}
1231-
++orte_odls_globals.next_base;
1232-
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1233-
orte_odls_globals.next_base = 0;
1234-
}
12351240
opal_output_verbose(1, orte_odls_base_framework.framework_output,
12361241
"%s odls:dispatch %s to thread %d",
12371242
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
12381243
ORTE_NAME_PRINT(&child->name),
12391244
orte_odls_globals.next_base);
1240-
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
12411245
opal_event_set(evb, &cd->ev, -1,
12421246
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
12431247
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
@@ -1255,11 +1259,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12551259
free(effective_dir);
12561260
effective_dir = NULL;
12571261
}
1258-
/* tell the state machine that all local procs for this job
1259-
* were launched so that it can do whatever it needs to do,
1260-
* like send a state update message for all procs to the HNP
1261-
*/
1262-
ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE);
12631262

12641263
ERROR_OUT:
12651264
/* ensure we reset our working directory back to our default location */
@@ -1323,8 +1322,10 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i
13231322
* Wait for a callback indicating the child has completed.
13241323
*/
13251324

1326-
void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
1325+
void orte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
13271326
{
1327+
orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
1328+
orte_proc_t *proc = t2->child;
13281329
int i;
13291330
orte_job_t *jobdat;
13301331
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
@@ -1528,6 +1529,8 @@ void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
15281529
/* cancel the wait as this proc has already terminated */
15291530
orte_wait_cb_cancel(proc);
15301531
ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
1532+
/* cleanup the tracker */
1533+
OBJ_RELEASE(t2);
15311534
}
15321535

15331536
typedef struct {
@@ -1903,17 +1906,17 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
19031906
goto CLEANUP;
19041907
}
19051908
}
1906-
orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);
1907-
19081909
++orte_odls_globals.next_base;
19091910
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
19101911
orte_odls_globals.next_base = 0;
19111912
}
1913+
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1914+
orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
1915+
19121916
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
19131917
"%s restarting app %s",
19141918
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
19151919

1916-
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
19171920
opal_event_set(evb, &cd->ev, -1,
19181921
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
19191922
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);

0 commit comments

Comments
 (0)