Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 94ccd92

Browse files
authored
Merge pull request #1317 from jjhursey/topic/timeout
Add new CLI options: timeout, get-stack-traces, report-state-on-timeout
2 parents 5c8e2a9 + 2689db8 commit 94ccd92

File tree

6 files changed

+278
-14
lines changed

6 files changed

+278
-14
lines changed

orte/mca/odls/odls_types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
1111
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
12-
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
1515
* Copyright (c) 2014 Intel, Inc. All rights reserved.
@@ -78,6 +78,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
7878
#define ORTE_DAEMON_NEW_COLL_ID (orte_daemon_cmd_flag_t) 29
7979

8080

81+
/* for debug purposes, get stack traces from all application procs */
82+
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31
83+
8184
/*
8285
* Struct written up the pipe from the child to the parent.
8386
*/

orte/mca/rml/rml_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ BEGIN_C_DECLS
165165
/* error notifications */
166166
#define ORTE_RML_TAG_NOTIFICATION 59
167167

168+
/* stacktrace for debug */
169+
#define ORTE_RML_TAG_STACK_TRACE 60
170+
168171
#define ORTE_RML_TAG_MAX 100
169172

170173
#define ORTE_RML_TAG_NTOH(t) ntohl(t)

orte/orted/orted_comm.c

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
@@ -47,6 +47,7 @@
4747
#include "opal/mca/base/base.h"
4848
#include "opal/util/output.h"
4949
#include "opal/util/opal_environ.h"
50+
#include "opal/util/path.h"
5051
#include "opal/runtime/opal.h"
5152
#include "opal/runtime/opal_progress.h"
5253
#include "opal/dss/dss.h"
@@ -113,6 +114,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
113114
bool found = false;
114115
orte_node_t *node;
115116
orte_grpcomm_signature_t *sig;
117+
FILE *fp;
118+
char gscmd[256], path[1035], *pathptr;
119+
char string[256], *string_ptr = string;
116120

117121
/* unpack the command */
118122
n = 1;
@@ -1137,6 +1141,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
11371141
}
11381142
break;
11391143

1144+
case ORTE_DAEMON_GET_STACK_TRACES:
1145+
/* prep the response */
1146+
answer = OBJ_NEW(opal_buffer_t);
1147+
pathptr = path;
1148+
1149+
// Try to find the "gstack" executable. Failure to find the
1150+
// executable will be handled below, because the receiver
1151+
// expects to have the process name, hostname, and PID in the
1152+
// buffer before finding an error message.
1153+
char *gstack_exec;
1154+
gstack_exec = opal_find_absolute_path("gstack");
1155+
1156+
/* hit each local process with a gstack command */
1157+
for (i=0; i < orte_local_children->size; i++) {
1158+
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
1159+
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
1160+
relay_msg = OBJ_NEW(opal_buffer_t);
1161+
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
1162+
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
1163+
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
1164+
OBJ_RELEASE(relay_msg);
1165+
break;
1166+
}
1167+
1168+
// If we were able to find the gstack executable,
1169+
// above, then run the command here.
1170+
fp = NULL;
1171+
if (NULL != gstack_exec) {
1172+
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
1173+
gstack_exec, (unsigned long) proct->pid);
1174+
fp = popen(gscmd, "r");
1175+
}
1176+
1177+
// If either we weren't able to find or run the gstack
1178+
// executable, send back a nice error message here.
1179+
if (NULL == gstack_exec || NULL == fp) {
1180+
(void) snprintf(string, sizeof(string),
1181+
"Failed to %s \"%s\" on %s to obtain stack traces",
1182+
(NULL == gstack_exec) ? "find" : "run",
1183+
(NULL == gstack_exec) ? "gstack" : gstack_exec,
1184+
proct->node->name);
1185+
if (OPAL_SUCCESS ==
1186+
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
1187+
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
1188+
}
1189+
OBJ_RELEASE(relay_msg);
1190+
break;
1191+
}
1192+
/* Read the output a line at a time and pack it for transmission */
1193+
memset(path, 0, sizeof(path));
1194+
while (fgets(path, sizeof(path)-1, fp) != NULL) {
1195+
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
1196+
OBJ_RELEASE(relay_msg);
1197+
break;
1198+
}
1199+
memset(path, 0, sizeof(path));
1200+
}
1201+
/* close */
1202+
pclose(fp);
1203+
/* transfer this load */
1204+
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
1205+
OBJ_RELEASE(relay_msg);
1206+
break;
1207+
}
1208+
OBJ_RELEASE(relay_msg);
1209+
}
1210+
}
1211+
/* always send our response */
1212+
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
1213+
ORTE_RML_TAG_STACK_TRACE,
1214+
orte_rml_send_callback, NULL))) {
1215+
ORTE_ERROR_LOG(ret);
1216+
OBJ_RELEASE(answer);
1217+
}
1218+
break;
1219+
11401220
default:
11411221
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
11421222
}
@@ -1201,6 +1281,9 @@ static char *get_orted_comm_cmd_str(int command)
12011281
case ORTE_DAEMON_NEW_COLL_ID:
12021282
return strdup("ORTE_DAEMON_NEW_COLL_ID");
12031283

1284+
case ORTE_DAEMON_GET_STACK_TRACES:
1285+
return strdup("ORTE_DAEMON_GET_STACK_TRACES");
1286+
12041287
default:
12051288
return strdup("Unknown Command!");
12061289
}

orte/tools/orterun/help-orterun.txt

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
13+
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
1414
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
1515
# $COPYRIGHT$
1616
#
@@ -644,11 +644,11 @@ Please correct this value and try again.
644644
The user-provided time limit for job execution has been
645645
reached:
646646

647-
MPIEXEC_TIMEOUT: %s seconds
647+
MPIEXEC_TIMEOUT: %d seconds
648648

649-
The job will now be aborted. Please check your code and/or
650-
adjust/remove the job execution time limit (as specified
651-
by MPIEXEC_TIMEOUT in your environment).
649+
The job will now be aborted. Please check your code and/or
650+
adjust/remove the job execution time limit (as specified by
651+
MPIEXEC_TIMEOUT in your environment or --timeout on the command line).
652652
#
653653
[orterun:conflict-env-set]
654654
ERROR: You have attempted to pass environment variables to Open MPI
@@ -666,3 +666,12 @@ be restored in a future version of Open MPI.
666666

667667
Please see https://github.com/open-mpi/ompi/issues/1225 for details.
668668

669+
#
670+
[orterun:timeoutconflict]
671+
Conflicting requests for timeout were given:
672+
673+
--timeout command line option: %d
674+
MPIEXEC_TIMEOUT envar: %s
675+
676+
Only one method should be provided, or else they must agree. Please
677+
correct and retry.

0 commit comments

Comments
 (0)