Skip to content

Commit b7e45ef

Browse files
authored
Merge pull request #2837 from jjhursey/topic/ibm/v2.x/orted-timeout-improv
orterun: Add parameter to control when we give up on stack traces
2 parents 74440de + b858344 commit b7e45ef

File tree

4 files changed

+31
-8
lines changed

4 files changed

+31
-8
lines changed

orte/runtime/orte_globals.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
1717
* Copyright (c) 2014-2015 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
19+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -129,6 +130,8 @@ orte_timer_t *orte_mpiexec_timeout = NULL;
129130

130131
opal_buffer_t *orte_tree_launch_cmd = NULL;
131132

133+
/* Seconds to wait for stack traces to return before terminating the job;
 * a value <= 0 means no timeout is armed (wait forever). */
int orte_stack_trace_wait_timeout = 30;
134+
132135
/* global arrays for data storage */
133136
opal_pointer_array_t *orte_job_data = NULL;
134137
opal_pointer_array_t *orte_node_pool = NULL;

orte/runtime/orte_globals.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1515
* All rights reserved.
1616
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
17+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -587,6 +588,9 @@ ORTE_DECLSPEC extern char *orte_daemon_cores;
587588
/* cutoff for collective modex */
588589
ORTE_DECLSPEC extern uint32_t orte_direct_modex_cutoff;
589590

591+
/* Max time to wait for stack traces to return */
592+
ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout;
593+
590594
END_C_DECLS
591595

592596
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */

orte/runtime/orte_mca_params.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
1717
* Copyright (c) 2014 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
19+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -743,5 +744,14 @@ int orte_register_params(void)
743744
/* register a synonym for old name */
744745
mca_base_var_register_synonym (id, "ompi", "ompi", "hostname", "cutoff", MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
745746

747+
748+
/* Amount of time to wait for a stack trace to return from the daemons */
749+
orte_stack_trace_wait_timeout = 30;
750+
(void) mca_base_var_register ("orte", "orte", NULL, "timeout_for_stack_trace",
751+
"Seconds to wait for stack traces to return before terminating the job (<= 0 wait forever)",
752+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
753+
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
754+
&orte_stack_trace_wait_timeout);
755+
746756
return ORTE_SUCCESS;
747757
}

orte/tools/orterun/orterun.c

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2860,8 +2860,10 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,
28602860
}
28612861
++ntraces;
28622862
if (orte_process_info.num_procs == ntraces) {
2863-
/* cancel the timeout */
2864-
OBJ_DESTRUCT(&stack_trace_timer);
2863+
if( orte_stack_trace_wait_timeout > 0 ) {
2864+
/* cancel the timeout */
2865+
OBJ_DESTRUCT(&stack_trace_timer);
2866+
}
28652867
/* abort the job */
28662868
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
28672869
/* set the global abnormal exit flag */
@@ -2871,6 +2873,8 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,
28712873

28722874
static void stack_trace_timeout(int sd, short args, void *cbdata)
28732875
{
2876+
fprintf(stderr, "Timed out waiting for stack traces. Job will now terminate. orte_stack_trace_wait_timeout = %d\n", orte_stack_trace_wait_timeout);
2877+
28742878
/* abort the job */
28752879
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
28762880
/* set the global abnormal exit flag */
@@ -2963,12 +2967,14 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
29632967
OBJ_RELEASE(sig);
29642968
/* we will terminate after we get the stack_traces, but set a timeout
29652969
* just in case we never hear back from everyone */
2966-
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
2967-
opal_event_evtimer_set(orte_event_base,
2968-
stack_trace_timer.ev, stack_trace_timeout, NULL);
2969-
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
2970-
stack_trace_timer.tv.tv_sec = 30;
2971-
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
2970+
if( orte_stack_trace_wait_timeout > 0 ) {
2971+
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
2972+
opal_event_evtimer_set(orte_event_base,
2973+
stack_trace_timer.ev, stack_trace_timeout, NULL);
2974+
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
2975+
stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout;
2976+
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
2977+
}
29722978
return;
29732979
}
29742980
giveup:

0 commit comments

Comments
 (0)