|
9 | 9 | * University of Stuttgart. All rights reserved.
|
10 | 10 | * Copyright (c) 2004-2005 The Regents of the University of California.
|
11 | 11 | * All rights reserved.
|
12 |
| - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. |
| 12 | + * Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. |
13 | 13 | * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
14 | 14 | * reserved.
|
15 | 15 | * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
|
47 | 47 | #include "opal/mca/base/base.h"
|
48 | 48 | #include "opal/util/output.h"
|
49 | 49 | #include "opal/util/opal_environ.h"
|
| 50 | +#include "opal/util/path.h" |
50 | 51 | #include "opal/runtime/opal.h"
|
51 | 52 | #include "opal/runtime/opal_progress.h"
|
52 | 53 | #include "opal/dss/dss.h"
|
@@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
111 | 112 | bool found = false;
|
112 | 113 | orte_node_t *node;
|
113 | 114 | orte_grpcomm_signature_t *sig;
|
| 115 | + FILE *fp; |
| 116 | + char gscmd[256], path[1035], *pathptr; |
| 117 | + char string[256], *string_ptr = string; |
114 | 118 |
|
115 | 119 | /* unpack the command */
|
116 | 120 | n = 1;
|
@@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
1071 | 1075 | }
|
1072 | 1076 | break;
|
1073 | 1077 |
|
| 1078 | + case ORTE_DAEMON_GET_STACK_TRACES: |
| 1079 | + /* prep the response */ |
| 1080 | + answer = OBJ_NEW(opal_buffer_t); |
| 1081 | + pathptr = path; |
| 1082 | + |
| 1083 | + // Try to find the "gstack" executable. Failure to find the |
| 1084 | + // executable will be handled below, because the receiver |
| 1085 | + // expects to have the process name, hostname, and PID in the |
| 1086 | + // buffer before finding an error message. |
| 1087 | + char *gstack_exec; |
| 1088 | + gstack_exec = opal_find_absolute_path("gstack"); |
| 1089 | + |
| 1090 | + /* hit each local process with a gstack command */ |
| 1091 | + for (i=0; i < orte_local_children->size; i++) { |
| 1092 | + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && |
| 1093 | + ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { |
| 1094 | + relay_msg = OBJ_NEW(opal_buffer_t); |
| 1095 | + if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) || |
| 1096 | + OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) || |
| 1097 | + OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) { |
| 1098 | + OBJ_RELEASE(relay_msg); |
| 1099 | + break; |
| 1100 | + } |
| 1101 | + |
| 1102 | + // If we were able to find the gstack executable, |
| 1103 | + // above, then run the command here. |
| 1104 | + fp = NULL; |
| 1105 | + if (NULL != gstack_exec) { |
| 1106 | + (void) snprintf(gscmd, sizeof(gscmd), "%s %lu", |
| 1107 | + gstack_exec, (unsigned long) proct->pid); |
| 1108 | + fp = popen(gscmd, "r"); |
| 1109 | + } |
| 1110 | + |
| 1111 | + // If we were unable to either find or run the gstack |
| 1112 | + // executable, send back a nice error message here. |
| 1113 | + if (NULL == gstack_exec || NULL == fp) { |
| 1114 | + (void) snprintf(string, sizeof(string), |
| 1115 | + "Failed to %s \"%s\" on %s to obtain stack traces", |
| 1116 | + (NULL == gstack_exec) ? "find" : "run", |
| 1117 | + (NULL == gstack_exec) ? "gstack" : gstack_exec, |
| 1118 | + proct->node->name); |
| 1119 | + if (OPAL_SUCCESS == |
| 1120 | + opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) { |
| 1121 | + opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER); |
| 1122 | + } |
| 1123 | + OBJ_RELEASE(relay_msg); |
| 1124 | + break; |
| 1125 | + } |
| 1126 | + /* Read the output a line at a time and pack it for transmission */ |
| 1127 | + memset(path, 0, sizeof(path)); |
| 1128 | + while (fgets(path, sizeof(path)-1, fp) != NULL) { |
| 1129 | + if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) { |
| 1130 | + OBJ_RELEASE(relay_msg); |
| 1131 | + break; |
| 1132 | + } |
| 1133 | + memset(path, 0, sizeof(path)); |
| 1134 | + } |
| 1135 | + /* close */ |
| 1136 | + pclose(fp); |
| 1137 | + /* transfer this load */ |
| 1138 | + if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) { |
| 1139 | + OBJ_RELEASE(relay_msg); |
| 1140 | + break; |
| 1141 | + } |
| 1142 | + OBJ_RELEASE(relay_msg); |
| 1143 | + } |
| 1144 | + } |
| 1145 | + /* always send our response */ |
| 1146 | + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer, |
| 1147 | + ORTE_RML_TAG_STACK_TRACE, |
| 1148 | + orte_rml_send_callback, NULL))) { |
| 1149 | + ORTE_ERROR_LOG(ret); |
| 1150 | + OBJ_RELEASE(answer); |
| 1151 | + } |
| 1152 | + break; |
| 1153 | + |
1074 | 1154 | default:
|
1075 | 1155 | ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
1076 | 1156 | }
|
@@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
|
1139 | 1219 | case ORTE_DAEMON_DVM_ADD_PROCS:
|
1140 | 1220 | return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
|
1141 | 1221 |
|
| 1222 | + case ORTE_DAEMON_GET_STACK_TRACES: |
| 1223 | + return strdup("ORTE_DAEMON_GET_STACK_TRACES"); |
| 1224 | + |
1142 | 1225 | default:
|
1143 | 1226 | return strdup("Unknown Command!");
|
1144 | 1227 | }
|
|
0 commit comments