Skip to content

Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests #1724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 28, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion orte/mca/odls/odls_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
Expand Down Expand Up @@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
/* add procs for the DVM */
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30

/* for debug purposes, get stack traces from all application procs */
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31

/*
* Struct written up the pipe from the child to the parent.
*/
Expand Down
3 changes: 3 additions & 0 deletions orte/mca/rml/rml_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ BEGIN_C_DECLS
/* error notifications */
#define ORTE_RML_TAG_NOTIFICATION 59

/* stacktrace for debug */
#define ORTE_RML_TAG_STACK_TRACE 60

#define ORTE_RML_TAG_MAX 100

#define ORTE_RML_TAG_NTOH(t) ntohl(t)
Expand Down
14 changes: 13 additions & 1 deletion orte/mca/schizo/ompi/schizo_ompi.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
Expand Down Expand Up @@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
&orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },

/* testing options */
{ NULL, '\0', "timeout", "timeout", 1,
&orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT,
"Timeout the job after the specified number of seconds" },
{ NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0,
&orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL,
"Report all job and process states upon timeout" },
{ NULL, '\0', "get-stack-traces", "get-stack-traces", 0,
&orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL,
"Get stack traces of all application procs on timeout" },


/* exit status reporting */
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
Expand Down
85 changes: 84 additions & 1 deletion orte/orted/orted_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
Expand Down Expand Up @@ -47,6 +47,7 @@
#include <stdlib.h>

#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress.h"
#include "opal/dss/dss.h"
Expand Down Expand Up @@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
bool found = false;
orte_node_t *node;
orte_grpcomm_signature_t *sig;
FILE *fp;
char gscmd[256], path[1035], *pathptr;
char string[256], *string_ptr = string;

/* unpack the command */
n = 1;
Expand Down Expand Up @@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
break;

case ORTE_DAEMON_GET_STACK_TRACES:
/* prep the response */
answer = OBJ_NEW(opal_buffer_t);
pathptr = path;

// Try to find the "gstack" executable. Failure to find the
// executable will be handled below, because the receiver
// expects to have the process name, hostname, and PID in the
// buffer before finding an error message.
char *gstack_exec;
gstack_exec = opal_find_absolute_path("gstack");

/* hit each local process with a gstack command */
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
relay_msg = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
OBJ_RELEASE(relay_msg);
break;
}

// If we were able to find the gstack executable,
// above, then run the command here.
fp = NULL;
if (NULL != gstack_exec) {
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
gstack_exec, (unsigned long) proct->pid);
fp = popen(gscmd, "r");
}

// If either we weren't able to find or run the gstack
// exectuable, send back a nice error message here.
if (NULL == gstack_exec || NULL == fp) {
(void) snprintf(string, sizeof(string),
"Failed to %s \"%s\" on %s to obtain stack traces",
(NULL == gstack_exec) ? "find" : "run",
(NULL == gstack_exec) ? "gstack" : gstack_exec,
proct->node->name);
if (OPAL_SUCCESS ==
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
}
OBJ_RELEASE(relay_msg);
break;
}
/* Read the output a line at a time and pack it for transmission */
memset(path, 0, sizeof(path));
while (fgets(path, sizeof(path)-1, fp) != NULL) {
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
OBJ_RELEASE(relay_msg);
break;
}
memset(path, 0, sizeof(path));
}
/* close */
pclose(fp);
/* transfer this load */
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
OBJ_RELEASE(relay_msg);
break;
}
OBJ_RELEASE(relay_msg);
}
}
/* always send our response */
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
ORTE_RML_TAG_STACK_TRACE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
break;

default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
Expand Down Expand Up @@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
case ORTE_DAEMON_DVM_ADD_PROCS:
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");

case ORTE_DAEMON_GET_STACK_TRACES:
return strdup("ORTE_DAEMON_GET_STACK_TRACES");

default:
return strdup("Unknown Command!");
}
Expand Down
Loading