15
15
* All rights reserved.
16
16
* Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
17
17
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18
- * Copyright (c) 2014 Research Organization for Information Science
18
+ * Copyright (c) 2014-2017 Research Organization for Information Science
19
19
* and Technology (RIST). All rights reserved.
20
20
* Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
21
21
* Copyright (c) 2017 IBM Corporation. All rights reserved.
@@ -614,6 +614,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
614
614
goto REPORT_ERROR ;
615
615
}
616
616
617
+ /* spin up the spawn threads */
618
+ orte_odls_base_start_threads (jdata );
619
+
617
620
/* to save memory, purge the job map of all procs other than
618
621
* our own - for daemons, this will completely release the
619
622
* proc structures. For the HNP, the proc structs will
@@ -727,9 +730,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
727
730
int rc , i ;
728
731
bool found ;
729
732
orte_proc_state_t state ;
730
- char * * argvptr ;
731
- char * pathenv = NULL , * mpiexec_pathenv = NULL ;
732
- char * full_search ;
733
733
734
734
ORTE_ACQUIRE_OBJECT (cd );
735
735
@@ -772,44 +772,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
772
772
goto errorout ;
773
773
}
774
774
775
- /* Search for the OMPI_exec_path and PATH settings in the environment. */
776
- for (argvptr = app -> env ; * argvptr != NULL ; argvptr ++ ) {
777
- if (0 == strncmp ("OMPI_exec_path=" , * argvptr , 15 )) {
778
- mpiexec_pathenv = * argvptr + 15 ;
779
- }
780
- if (0 == strncmp ("PATH=" , * argvptr , 5 )) {
781
- pathenv = * argvptr + 5 ;
782
- }
783
- }
784
-
785
- /* If OMPI_exec_path is set (meaning --path was used), then create a
786
- temporary environment to be used in the search for the executable.
787
- The PATH setting in this temporary environment is a combination of
788
- the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
789
- then just use existing environment with PATH in it. */
790
- if (NULL != mpiexec_pathenv ) {
791
- argvptr = NULL ;
792
- if (pathenv != NULL ) {
793
- asprintf (& full_search , "%s:%s" , mpiexec_pathenv , pathenv );
794
- } else {
795
- asprintf (& full_search , "%s" , mpiexec_pathenv );
796
- }
797
- opal_setenv ("PATH" , full_search , true, & argvptr );
798
- free (full_search );
799
- } else {
800
- argvptr = app -> env ;
801
- }
802
-
803
- rc = orte_util_check_context_app (app , argvptr );
804
- /* do not ERROR_LOG - it will be reported elsewhere */
805
- if (NULL != mpiexec_pathenv ) {
806
- opal_argv_free (argvptr );
807
- }
808
- if (ORTE_SUCCESS != rc ) {
809
- state = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
810
- goto errorout ;
811
- }
812
-
813
775
/* did the user request we display output in xterms? */
814
776
if (NULL != orte_xterm && !ORTE_FLAG_TEST (jobdat , ORTE_JOB_FLAG_DEBUGGER_DAEMON )) {
815
777
opal_list_item_t * nmitem ;
@@ -878,15 +840,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
878
840
cd -> argv [0 ] = param ;
879
841
}
880
842
881
- if ( 5 < opal_output_get_verbosity ( orte_odls_base_framework .framework_output )) {
882
- opal_output ( orte_odls_base_framework . framework_output , "%s odls:launch spawning child %s" ,
883
- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
884
- ORTE_NAME_PRINT (& child -> name ));
843
+ opal_output_verbose ( 5 , orte_odls_base_framework .framework_output ,
844
+ "%s odls:launch spawning child %s" ,
845
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
846
+ ORTE_NAME_PRINT (& child -> name ));
885
847
848
+ if (15 < opal_output_get_verbosity (orte_odls_base_framework .framework_output )) {
886
849
/* dump what is going to be exec'd */
887
- if (7 < opal_output_get_verbosity (orte_odls_base_framework .framework_output )) {
888
- opal_dss .dump (orte_odls_base_framework .framework_output , app , ORTE_APP_CONTEXT );
889
- }
850
+ opal_dss .dump (orte_odls_base_framework .framework_output , app , ORTE_APP_CONTEXT );
890
851
}
891
852
892
853
if (ORTE_SUCCESS != (rc = cd -> fork_local (cd ))) {
@@ -923,6 +884,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
923
884
orte_odls_spawn_caddy_t * cd ;
924
885
opal_event_base_t * evb ;
925
886
char * effective_dir = NULL ;
887
+ char * * argvptr ;
888
+ char * pathenv = NULL , * mpiexec_pathenv = NULL ;
889
+ char * full_search ;
926
890
927
891
ORTE_ACQUIRE_OBJECT (caddy );
928
892
@@ -1105,6 +1069,44 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
1105
1069
goto GETOUT ;
1106
1070
}
1107
1071
1072
+ /* Search for the OMPI_exec_path and PATH settings in the environment. */
1073
+ for (argvptr = app -> env ; * argvptr != NULL ; argvptr ++ ) {
1074
+ if (0 == strncmp ("OMPI_exec_path=" , * argvptr , 15 )) {
1075
+ mpiexec_pathenv = * argvptr + 15 ;
1076
+ }
1077
+ if (0 == strncmp ("PATH=" , * argvptr , 5 )) {
1078
+ pathenv = * argvptr + 5 ;
1079
+ }
1080
+ }
1081
+
1082
+ /* If OMPI_exec_path is set (meaning --path was used), then create a
1083
+ temporary environment to be used in the search for the executable.
1084
+ The PATH setting in this temporary environment is a combination of
1085
+ the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
1086
+ then just use existing environment with PATH in it. */
1087
+ if (NULL != mpiexec_pathenv ) {
1088
+ argvptr = NULL ;
1089
+ if (pathenv != NULL ) {
1090
+ asprintf (& full_search , "%s:%s" , mpiexec_pathenv , pathenv );
1091
+ } else {
1092
+ asprintf (& full_search , "%s" , mpiexec_pathenv );
1093
+ }
1094
+ opal_setenv ("PATH" , full_search , true, & argvptr );
1095
+ free (full_search );
1096
+ } else {
1097
+ argvptr = app -> env ;
1098
+ }
1099
+
1100
+ rc = orte_util_check_context_app (app , argvptr );
1101
+ /* do not ERROR_LOG - it will be reported elsewhere */
1102
+ if (NULL != mpiexec_pathenv ) {
1103
+ opal_argv_free (argvptr );
1104
+ }
1105
+ if (ORTE_SUCCESS != rc ) {
1106
+ goto GETOUT ;
1107
+ }
1108
+
1109
+
1108
1110
/* tell all children that they are being launched via ORTE */
1109
1111
opal_setenv (OPAL_MCA_PREFIX "orte_launch" , "1" , true, & app -> env );
1110
1112
@@ -1186,10 +1188,17 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
1186
1188
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1187
1189
ORTE_NAME_PRINT (& child -> name )));
1188
1190
1191
+ /* determine the thread that will handle this child */
1192
+ ++ orte_odls_globals .next_base ;
1193
+ if (orte_odls_globals .num_threads <= orte_odls_globals .next_base ) {
1194
+ orte_odls_globals .next_base = 0 ;
1195
+ }
1196
+ evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
1197
+
1189
1198
/* set the waitpid callback here for thread protection and
1190
1199
* to ensure we can capture the callback on short-lived apps */
1191
1200
ORTE_FLAG_SET (child , ORTE_PROC_FLAG_ALIVE );
1192
- orte_wait_cb (child , ompi_odls_base_default_wait_local_proc , NULL );
1201
+ orte_wait_cb (child , orte_odls_base_default_wait_local_proc , evb , NULL );
1193
1202
1194
1203
/* dispatch this child to the next available launch thread */
1195
1204
cd = OBJ_NEW (orte_odls_spawn_caddy_t );
@@ -1228,16 +1237,11 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
1228
1237
goto GETOUT ;
1229
1238
}
1230
1239
}
1231
- ++ orte_odls_globals .next_base ;
1232
- if (orte_odls_globals .num_threads <= orte_odls_globals .next_base ) {
1233
- orte_odls_globals .next_base = 0 ;
1234
- }
1235
1240
opal_output_verbose (1 , orte_odls_base_framework .framework_output ,
1236
1241
"%s odls:dispatch %s to thread %d" ,
1237
1242
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1238
1243
ORTE_NAME_PRINT (& child -> name ),
1239
1244
orte_odls_globals .next_base );
1240
- evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
1241
1245
opal_event_set (evb , & cd -> ev , -1 ,
1242
1246
OPAL_EV_WRITE , orte_odls_base_spawn_proc , cd );
1243
1247
opal_event_set_priority (& cd -> ev , ORTE_MSG_PRI );
@@ -1255,11 +1259,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
1255
1259
free (effective_dir );
1256
1260
effective_dir = NULL ;
1257
1261
}
1258
- /* tell the state machine that all local procs for this job
1259
- * were launched so that it can do whatever it needs to do,
1260
- * like send a state update message for all procs to the HNP
1261
- */
1262
- ORTE_ACTIVATE_JOB_STATE (jobdat , ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE );
1263
1262
1264
1263
ERROR_OUT :
1265
1264
/* ensure we reset our working directory back to our default location */
@@ -1323,8 +1322,10 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i
1323
1322
* Wait for a callback indicating the child has completed.
1324
1323
*/
1325
1324
1326
- void ompi_odls_base_default_wait_local_proc ( orte_proc_t * proc , void * cbdata )
1325
+ void orte_odls_base_default_wait_local_proc ( int fd , short sd , void * cbdata )
1327
1326
{
1327
+ orte_wait_tracker_t * t2 = (orte_wait_tracker_t * )cbdata ;
1328
+ orte_proc_t * proc = t2 -> child ;
1328
1329
int i ;
1329
1330
orte_job_t * jobdat ;
1330
1331
orte_proc_state_t state = ORTE_PROC_STATE_WAITPID_FIRED ;
@@ -1528,6 +1529,8 @@ void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
1528
1529
/* cancel the wait as this proc has already terminated */
1529
1530
orte_wait_cb_cancel (proc );
1530
1531
ORTE_ACTIVATE_PROC_STATE (& proc -> name , state );
1532
+ /* cleanup the tracker */
1533
+ OBJ_RELEASE (t2 );
1531
1534
}
1532
1535
1533
1536
typedef struct {
@@ -1903,17 +1906,17 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
1903
1906
goto CLEANUP ;
1904
1907
}
1905
1908
}
1906
- orte_wait_cb (child , ompi_odls_base_default_wait_local_proc , NULL );
1907
-
1908
1909
++ orte_odls_globals .next_base ;
1909
1910
if (orte_odls_globals .num_threads <= orte_odls_globals .next_base ) {
1910
1911
orte_odls_globals .next_base = 0 ;
1911
1912
}
1913
+ evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
1914
+ orte_wait_cb (child , orte_odls_base_default_wait_local_proc , evb , NULL );
1915
+
1912
1916
OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1913
1917
"%s restarting app %s" ,
1914
1918
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), app -> app ));
1915
1919
1916
- evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
1917
1920
opal_event_set (evb , & cd -> ev , -1 ,
1918
1921
OPAL_EV_WRITE , orte_odls_base_spawn_proc , cd );
1919
1922
opal_event_set_priority (& cd -> ev , ORTE_MSG_PRI );
0 commit comments