Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit ebe159a

Browse files
author
Ralph Castain
committed
Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests
If requested, obtain stack traces for each application process and report them to stderr upon timeout. stack traces: minor improvements - Also include the hostname and PID of each process for which we're sending the stack traces (vs. just including the ORTE process name) - Send a specific error message if we couldn't find "gstack" in the $PATH (e.g., on OS X) - Send a specific error message if gstack fails to run - Print a message that obtaining the stack traces may take a few seconds so that users don't wonder what's happening Signed-off-by: Jeff Squyres <[email protected]> help-orterun.txt: minor tweaks Trivial update: show "--timeout" (instead of "-timeout") in the help message, just to encourage the use of double-dash options. Signed-off-by: Jeff Squyres <[email protected]> trivial: stacktrace -> stack trace Trivial word smithing. Signed-off-by: Jeff Squyres <[email protected]>
1 parent d25b846 commit ebe159a

File tree

7 files changed

+278
-15
lines changed

7 files changed

+278
-15
lines changed

orte/mca/odls/odls_types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
1111
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
12-
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
1515
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
@@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
8080
/* add procs for the DVM */
8181
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30
8282

83+
/* for debug purposes, get stack traces from all application procs */
84+
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31
85+
8386
/*
8487
* Struct written up the pipe from the child to the parent.
8588
*/

orte/mca/rml/rml_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ BEGIN_C_DECLS
163163
/* error notifications */
164164
#define ORTE_RML_TAG_NOTIFICATION 59
165165

166+
/* stacktrace for debug */
167+
#define ORTE_RML_TAG_STACK_TRACE 60
168+
166169
#define ORTE_RML_TAG_MAX 100
167170

168171
#define ORTE_RML_TAG_NTOH(t) ntohl(t)

orte/mca/schizo/ompi/schizo_ompi.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14-
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2015 Research Organization for Information Science
@@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
9292
&orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING,
9393
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
9494

95+
/* testing options */
96+
{ NULL, '\0', "timeout", "timeout", 1,
97+
&orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT,
98+
"Timeout the job after the specified number of seconds" },
99+
{ NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0,
100+
&orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL,
101+
"Report all job and process states upon timeout" },
102+
{ NULL, '\0', "get-stack-traces", "get-stack-traces", 0,
103+
&orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL,
104+
"Get stack traces of all application procs on timeout" },
105+
106+
95107
/* exit status reporting */
96108
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
97109
NULL, OPAL_CMD_LINE_TYPE_BOOL,

orte/orted/orted_comm.c

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
@@ -47,6 +47,7 @@
4747
#include "opal/mca/base/base.h"
4848
#include "opal/util/output.h"
4949
#include "opal/util/opal_environ.h"
50+
#include "opal/util/path.h"
5051
#include "opal/runtime/opal.h"
5152
#include "opal/runtime/opal_progress.h"
5253
#include "opal/dss/dss.h"
@@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
111112
bool found = false;
112113
orte_node_t *node;
113114
orte_grpcomm_signature_t *sig;
115+
FILE *fp;
116+
char gscmd[256], path[1035], *pathptr;
117+
char string[256], *string_ptr = string;
114118

115119
/* unpack the command */
116120
n = 1;
@@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
10711075
}
10721076
break;
10731077

1078+
case ORTE_DAEMON_GET_STACK_TRACES:
1079+
/* prep the response */
1080+
answer = OBJ_NEW(opal_buffer_t);
1081+
pathptr = path;
1082+
1083+
// Try to find the "gstack" executable. Failure to find the
1084+
// executable will be handled below, because the receiver
1085+
// expects to have the process name, hostname, and PID in the
1086+
// buffer before finding an error message.
1087+
char *gstack_exec;
1088+
gstack_exec = opal_find_absolute_path("gstack");
1089+
1090+
/* hit each local process with a gstack command */
1091+
for (i=0; i < orte_local_children->size; i++) {
1092+
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
1093+
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
1094+
relay_msg = OBJ_NEW(opal_buffer_t);
1095+
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
1096+
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
1097+
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
1098+
OBJ_RELEASE(relay_msg);
1099+
break;
1100+
}
1101+
1102+
// If we were able to find the gstack executable,
1103+
// above, then run the command here.
1104+
fp = NULL;
1105+
if (NULL != gstack_exec) {
1106+
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
1107+
gstack_exec, (unsigned long) proct->pid);
1108+
fp = popen(gscmd, "r");
1109+
}
1110+
1111+
// If either we weren't able to find or run the gstack
1112+
// executable, send back a nice error message here.
1113+
if (NULL == gstack_exec || NULL == fp) {
1114+
(void) snprintf(string, sizeof(string),
1115+
"Failed to %s \"%s\" on %s to obtain stack traces",
1116+
(NULL == gstack_exec) ? "find" : "run",
1117+
(NULL == gstack_exec) ? "gstack" : gstack_exec,
1118+
proct->node->name);
1119+
if (OPAL_SUCCESS ==
1120+
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
1121+
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
1122+
}
1123+
OBJ_RELEASE(relay_msg);
1124+
break;
1125+
}
1126+
/* Read the output a line at a time and pack it for transmission */
1127+
memset(path, 0, sizeof(path));
1128+
while (fgets(path, sizeof(path)-1, fp) != NULL) {
1129+
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
1130+
OBJ_RELEASE(relay_msg);
1131+
break;
1132+
}
1133+
memset(path, 0, sizeof(path));
1134+
}
1135+
/* close */
1136+
pclose(fp);
1137+
/* transfer this load */
1138+
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
1139+
OBJ_RELEASE(relay_msg);
1140+
break;
1141+
}
1142+
OBJ_RELEASE(relay_msg);
1143+
}
1144+
}
1145+
/* always send our response */
1146+
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
1147+
ORTE_RML_TAG_STACK_TRACE,
1148+
orte_rml_send_callback, NULL))) {
1149+
ORTE_ERROR_LOG(ret);
1150+
OBJ_RELEASE(answer);
1151+
}
1152+
break;
1153+
10741154
default:
10751155
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
10761156
}
@@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
11391219
case ORTE_DAEMON_DVM_ADD_PROCS:
11401220
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
11411221

1222+
case ORTE_DAEMON_GET_STACK_TRACES:
1223+
return strdup("ORTE_DAEMON_GET_STACK_TRACES");
1224+
11421225
default:
11431226
return strdup("Unknown Command!");
11441227
}

0 commit comments

Comments
 (0)