|
9 | 9 | * University of Stuttgart. All rights reserved.
|
10 | 10 | * Copyright (c) 2004-2005 The Regents of the University of California.
|
11 | 11 | * All rights reserved.
|
12 |
| - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. |
| 12 | + * Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. |
13 | 13 | * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
14 | 14 | * reserved.
|
15 | 15 | * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
|
47 | 47 | #include "opal/mca/base/base.h"
|
48 | 48 | #include "opal/util/output.h"
|
49 | 49 | #include "opal/util/opal_environ.h"
|
| 50 | +#include "opal/util/path.h" |
50 | 51 | #include "opal/runtime/opal.h"
|
51 | 52 | #include "opal/runtime/opal_progress.h"
|
52 | 53 | #include "opal/dss/dss.h"
|
@@ -113,6 +114,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
113 | 114 | bool found = false;
|
114 | 115 | orte_node_t *node;
|
115 | 116 | orte_grpcomm_signature_t *sig;
|
| 117 | + FILE *fp; |
| 118 | + char gscmd[256], path[1035], *pathptr; |
| 119 | + char string[256], *string_ptr = string; |
116 | 120 |
|
117 | 121 | /* unpack the command */
|
118 | 122 | n = 1;
|
@@ -1137,6 +1141,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
1137 | 1141 | }
|
1138 | 1142 | break;
|
1139 | 1143 |
|
| 1144 | + case ORTE_DAEMON_GET_STACK_TRACES: |
| 1145 | + /* prep the response */ |
| 1146 | + answer = OBJ_NEW(opal_buffer_t); |
| 1147 | + pathptr = path; |
| 1148 | + |
| 1149 | + // Try to find the "gstack" executable. Failure to find the |
| 1150 | + // executable will be handled below, because the receiver |
| 1151 | + // expects to have the process name, hostname, and PID in the |
| 1152 | + // buffer before finding an error message. |
| 1153 | + char *gstack_exec; |
| 1154 | + gstack_exec = opal_find_absolute_path("gstack"); |
| 1155 | + |
| 1156 | + /* hit each local process with a gstack command */ |
| 1157 | + for (i=0; i < orte_local_children->size; i++) { |
| 1158 | + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && |
| 1159 | + ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { |
| 1160 | + relay_msg = OBJ_NEW(opal_buffer_t); |
| 1161 | + if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) || |
| 1162 | + OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) || |
| 1163 | + OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) { |
| 1164 | + OBJ_RELEASE(relay_msg); |
| 1165 | + break; |
| 1166 | + } |
| 1167 | + |
| 1168 | + // If we were able to find the gstack executable, |
| 1169 | + // above, then run the command here. |
| 1170 | + fp = NULL; |
| 1171 | + if (NULL != gstack_exec) { |
| 1172 | + (void) snprintf(gscmd, sizeof(gscmd), "%s %lu", |
| 1173 | + gstack_exec, (unsigned long) proct->pid); |
| 1174 | + fp = popen(gscmd, "r"); |
| 1175 | + } |
| 1176 | + |
| 1177 | + // If we weren't able to either find or run the gstack |
| 1178 | + // executable, send back a nice error message here. |
| 1179 | + if (NULL == gstack_exec || NULL == fp) { |
| 1180 | + (void) snprintf(string, sizeof(string), |
| 1181 | + "Failed to %s \"%s\" on %s to obtain stack traces", |
| 1182 | + (NULL == gstack_exec) ? "find" : "run", |
| 1183 | + (NULL == gstack_exec) ? "gstack" : gstack_exec, |
| 1184 | + proct->node->name); |
| 1185 | + if (OPAL_SUCCESS == |
| 1186 | + opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) { |
| 1187 | + opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER); |
| 1188 | + } |
| 1189 | + OBJ_RELEASE(relay_msg); |
| 1190 | + break; |
| 1191 | + } |
| 1192 | + /* Read the output a line at a time and pack it for transmission */ |
| 1193 | + memset(path, 0, sizeof(path)); |
| 1194 | + while (fgets(path, sizeof(path)-1, fp) != NULL) { |
| 1195 | + if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) { |
| 1196 | + OBJ_RELEASE(relay_msg); |
| 1197 | + break; |
| 1198 | + } |
| 1199 | + memset(path, 0, sizeof(path)); |
| 1200 | + } |
| 1201 | + /* close */ |
| 1202 | + pclose(fp); |
| 1203 | + /* transfer this load */ |
| 1204 | + if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) { |
| 1205 | + OBJ_RELEASE(relay_msg); |
| 1206 | + break; |
| 1207 | + } |
| 1208 | + OBJ_RELEASE(relay_msg); |
| 1209 | + } |
| 1210 | + } |
| 1211 | + /* always send our response */ |
| 1212 | + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer, |
| 1213 | + ORTE_RML_TAG_STACK_TRACE, |
| 1214 | + orte_rml_send_callback, NULL))) { |
| 1215 | + ORTE_ERROR_LOG(ret); |
| 1216 | + OBJ_RELEASE(answer); |
| 1217 | + } |
| 1218 | + break; |
| 1219 | + |
1140 | 1220 | default:
|
1141 | 1221 | ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
1142 | 1222 | }
|
@@ -1201,6 +1281,9 @@ static char *get_orted_comm_cmd_str(int command)
|
1201 | 1281 | case ORTE_DAEMON_NEW_COLL_ID:
|
1202 | 1282 | return strdup("ORTE_DAEMON_NEW_COLL_ID");
|
1203 | 1283 |
|
| 1284 | + case ORTE_DAEMON_GET_STACK_TRACES: |
| 1285 | + return strdup("ORTE_DAEMON_GET_STACK_TRACES"); |
| 1286 | + |
1204 | 1287 | default:
|
1205 | 1288 | return strdup("Unknown Command!");
|
1206 | 1289 | }
|
|
0 commit comments