Skip to content

Commit a93c01d

Browse files
author
rhc54
committed
Merge pull request #1724 from rhc54/topic/timeout
Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests
2 parents 59f4a76 + ebe159a commit a93c01d

File tree

7 files changed

+278
-15
lines changed

7 files changed

+278
-15
lines changed

orte/mca/odls/odls_types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
1111
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
12-
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
1515
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
@@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
8080
/* add procs for the DVM */
8181
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30
8282

83+
/* for debug purposes, get stack traces from all application procs */
84+
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31
85+
8386
/*
8487
* Struct written up the pipe from the child to the parent.
8588
*/

orte/mca/rml/rml_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ BEGIN_C_DECLS
163163
/* error notifications */
164164
#define ORTE_RML_TAG_NOTIFICATION 59
165165

166+
/* stacktrace for debug */
167+
#define ORTE_RML_TAG_STACK_TRACE 60
168+
166169
#define ORTE_RML_TAG_MAX 100
167170

168171
#define ORTE_RML_TAG_NTOH(t) ntohl(t)

orte/mca/schizo/ompi/schizo_ompi.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14-
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2015 Research Organization for Information Science
@@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
9292
&orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING,
9393
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
9494

95+
/* testing options */
96+
{ NULL, '\0', "timeout", "timeout", 1,
97+
&orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT,
98+
"Timeout the job after the specified number of seconds" },
99+
{ NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0,
100+
&orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL,
101+
"Report all job and process states upon timeout" },
102+
{ NULL, '\0', "get-stack-traces", "get-stack-traces", 0,
103+
&orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL,
104+
"Get stack traces of all application procs on timeout" },
105+
106+
95107
/* exit status reporting */
96108
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
97109
NULL, OPAL_CMD_LINE_TYPE_BOOL,

orte/orted/orted_comm.c

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
@@ -47,6 +47,7 @@
4747
#include "opal/mca/base/base.h"
4848
#include "opal/util/output.h"
4949
#include "opal/util/opal_environ.h"
50+
#include "opal/util/path.h"
5051
#include "opal/runtime/opal.h"
5152
#include "opal/runtime/opal_progress.h"
5253
#include "opal/dss/dss.h"
@@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
111112
bool found = false;
112113
orte_node_t *node;
113114
orte_grpcomm_signature_t *sig;
115+
FILE *fp;
116+
char gscmd[256], path[1035], *pathptr;
117+
char string[256], *string_ptr = string;
114118

115119
/* unpack the command */
116120
n = 1;
@@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
10711075
}
10721076
break;
10731077

1078+
case ORTE_DAEMON_GET_STACK_TRACES:
1079+
/* prep the response */
1080+
answer = OBJ_NEW(opal_buffer_t);
1081+
pathptr = path;
1082+
1083+
// Try to find the "gstack" executable. Failure to find the
1084+
// executable will be handled below, because the receiver
1085+
// expects to have the process name, hostname, and PID in the
1086+
// buffer before finding an error message.
1087+
char *gstack_exec;
1088+
gstack_exec = opal_find_absolute_path("gstack");
1089+
1090+
/* hit each local process with a gstack command */
1091+
for (i=0; i < orte_local_children->size; i++) {
1092+
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
1093+
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
1094+
relay_msg = OBJ_NEW(opal_buffer_t);
1095+
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
1096+
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
1097+
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
1098+
OBJ_RELEASE(relay_msg);
1099+
break;
1100+
}
1101+
1102+
// If we were able to find the gstack executable,
1103+
// above, then run the command here.
1104+
fp = NULL;
1105+
if (NULL != gstack_exec) {
1106+
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
1107+
gstack_exec, (unsigned long) proct->pid);
1108+
fp = popen(gscmd, "r");
1109+
}
1110+
1111+
// If we weren't able to either find or run the gstack
1112+
// executable, send back a nice error message here.
1113+
if (NULL == gstack_exec || NULL == fp) {
1114+
(void) snprintf(string, sizeof(string),
1115+
"Failed to %s \"%s\" on %s to obtain stack traces",
1116+
(NULL == gstack_exec) ? "find" : "run",
1117+
(NULL == gstack_exec) ? "gstack" : gstack_exec,
1118+
proct->node->name);
1119+
if (OPAL_SUCCESS ==
1120+
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
1121+
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
1122+
}
1123+
OBJ_RELEASE(relay_msg);
1124+
break;
1125+
}
1126+
/* Read the output a line at a time and pack it for transmission */
1127+
memset(path, 0, sizeof(path));
1128+
while (fgets(path, sizeof(path)-1, fp) != NULL) {
1129+
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
1130+
OBJ_RELEASE(relay_msg);
1131+
break;
1132+
}
1133+
memset(path, 0, sizeof(path));
1134+
}
1135+
/* close */
1136+
pclose(fp);
1137+
/* transfer this load */
1138+
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
1139+
OBJ_RELEASE(relay_msg);
1140+
break;
1141+
}
1142+
OBJ_RELEASE(relay_msg);
1143+
}
1144+
}
1145+
/* always send our response */
1146+
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
1147+
ORTE_RML_TAG_STACK_TRACE,
1148+
orte_rml_send_callback, NULL))) {
1149+
ORTE_ERROR_LOG(ret);
1150+
OBJ_RELEASE(answer);
1151+
}
1152+
break;
1153+
10741154
default:
10751155
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
10761156
}
@@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
11391219
case ORTE_DAEMON_DVM_ADD_PROCS:
11401220
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
11411221

1222+
case ORTE_DAEMON_GET_STACK_TRACES:
1223+
return strdup("ORTE_DAEMON_GET_STACK_TRACES");
1224+
11421225
default:
11431226
return strdup("Unknown Command!");
11441227
}

0 commit comments

Comments
 (0)