Skip to content

Commit 10de9c7

Browse files
author
rhc54
committed
Merge pull request #1480 from rhc54/topic/usock
Fix debugger operations and show_help aggregation
2 parents e020566 + c146c49 commit 10de9c7

22 files changed

+3748
-162
lines changed

ompi/mca/rte/orte/rte_orte.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ OMPI_DECLSPEC void __opal_attribute_noreturn__
9191
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
9292

9393
/* Init and finalize objects and operations */
94-
OMPI_DECLSPEC int ompi_rte_init(int *pargc, char ***pargv);
94+
#define ompi_rte_init(a, b) orte_init(a, b, ORTE_PROC_MPI)
9595
#define ompi_rte_finalize() orte_finalize()
9696
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);
9797

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 21 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -52,79 +52,6 @@
5252

5353
extern ompi_rte_orte_component_t mca_rte_orte_component;
5454

55-
typedef struct {
56-
volatile bool active;
57-
int status;
58-
int errhandler;
59-
} errhandler_t;
60-
61-
static void register_cbfunc(int status, int errhndler, void *cbdata)
62-
{
63-
errhandler_t *cd = (errhandler_t*)cbdata;
64-
cd->status = status;
65-
cd->errhandler = errhndler;
66-
cd->active = false;
67-
}
68-
69-
static volatile bool wait_for_release = true;
70-
static int errhandler = -1;
71-
72-
static void notify_cbfunc(int status,
73-
opal_list_t *procs,
74-
opal_list_t *info,
75-
opal_pmix_release_cbfunc_t cbfunc,
76-
void *cbdata)
77-
{
78-
if (NULL != cbfunc) {
79-
cbfunc(cbdata);
80-
}
81-
wait_for_release = false;
82-
}
83-
84-
85-
int ompi_rte_init(int *pargc, char ***pargv)
86-
{
87-
int rc;
88-
opal_list_t info;
89-
opal_value_t val;
90-
errhandler_t cd;
91-
92-
if (ORTE_SUCCESS != (rc = orte_init(pargc, pargv, ORTE_PROC_MPI))) {
93-
return rc;
94-
}
95-
96-
if (!orte_standalone_operation) {
97-
/* register to receive any debugger release */
98-
OBJ_CONSTRUCT(&info, opal_list_t);
99-
OBJ_CONSTRUCT(&val, opal_value_t);
100-
val.key = strdup(OPAL_PMIX_ERROR_NAME);
101-
val.type = OPAL_INT;
102-
val.data.integer = OPAL_ERR_DEBUGGER_RELEASE;
103-
opal_list_append(&info, &val.super);
104-
cd.status = ORTE_ERROR;
105-
cd.errhandler = -1;
106-
cd.active = true;
107-
108-
opal_pmix.register_errhandler(&info, notify_cbfunc, register_cbfunc, &cd);
109-
110-
/* let the MPI progress engine run while we wait for
111-
* registration to complete */
112-
OMPI_WAIT_FOR_COMPLETION(cd.active);
113-
/* safely deconstruct the list */
114-
opal_list_remove_first(&info);
115-
OBJ_DESTRUCT(&val);
116-
OBJ_DESTRUCT(&info);
117-
if (OPAL_SUCCESS != cd.status) {
118-
/* ouch - we are doomed */
119-
ORTE_ERROR_LOG(cd.status);
120-
return OMPI_ERROR;
121-
}
122-
errhandler = cd.errhandler;
123-
}
124-
125-
return OMPI_SUCCESS;
126-
}
127-
12855
void ompi_rte_abort(int error_code, char *fmt, ...)
12956
{
13057
va_list arglist;
@@ -173,10 +100,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
173100
* attaching debuggers -- see big comment in
174101
* orte/tools/orterun/debuggers.c explaining the two scenarios.
175102
*/
176-
177103
void ompi_rte_wait_for_debugger(void)
178104
{
179105
int debugger;
106+
orte_rml_recv_cb_t xfer;
180107

181108
/* See lengthy comment in orte/tools/orterun/debuggers.c about
182109
orte_in_parallel_debugger */
@@ -186,16 +113,16 @@ void ompi_rte_wait_for_debugger(void)
186113
debugger = 1;
187114
}
188115

189-
if (!debugger) {
116+
if (!debugger && NULL == getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
190117
/* if not, just return */
191118
return;
192119
}
120+
193121
/* if we are being debugged, then we need to find
194122
* the correct plug-ins
195123
*/
196124
ompi_debugger_setup_dlls();
197125

198-
/* wait for the debugger to attach */
199126
if (orte_standalone_operation) {
200127
/* spin until debugger attaches and releases us */
201128
while (MPIR_debug_gate == 0) {
@@ -206,9 +133,23 @@ void ompi_rte_wait_for_debugger(void)
206133
#endif
207134
}
208135
} else {
209-
/* now wait for the notification to occur */
210-
OMPI_WAIT_FOR_COMPLETION(wait_for_release);
211-
/* deregister the errhandler */
212-
opal_pmix.deregister_errhandler(errhandler, NULL, NULL);
136+
/* only the rank=0 proc waits for either a message from the
137+
* HNP or for the debugger to attach - everyone else will just
138+
* spin in the grpcomm barrier in ompi_mpi_init until rank=0
139+
* joins them.
140+
*/
141+
if (0 != ORTE_PROC_MY_NAME->vpid) {
142+
return;
143+
}
144+
145+
/* VPID 0 waits for a message from the HNP */
146+
OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
147+
xfer.active = true;
148+
orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
149+
ORTE_RML_TAG_DEBUGGER_RELEASE,
150+
ORTE_RML_NON_PERSISTENT,
151+
orte_rml_recv_callback, &xfer);
152+
/* let the MPI progress engine run while we wait */
153+
OMPI_WAIT_FOR_COMPLETION(xfer.active);
213154
}
214155
}

orte/mca/ess/base/ess_base_std_app.c

Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,14 @@
4747
#include "opal/runtime/opal.h"
4848
#include "opal/runtime/opal_cr.h"
4949

50+
#include "orte/mca/rml/base/base.h"
51+
#include "orte/mca/routed/base/base.h"
5052
#include "orte/mca/errmgr/errmgr.h"
5153
#include "orte/mca/dfs/base/base.h"
5254
#include "orte/mca/grpcomm/base/base.h"
55+
#include "orte/mca/oob/base/base.h"
56+
#include "orte/mca/rml/rml.h"
57+
#include "orte/mca/qos/base/base.h"
5358
#include "orte/mca/odls/odls_types.h"
5459
#include "orte/mca/filem/base/base.h"
5560
#include "orte/mca/errmgr/base/base.h"
@@ -169,14 +174,84 @@ int orte_ess_base_app_setup(bool db_restrict_local)
169174
}
170175
OBJ_DESTRUCT(&kv);
171176
}
172-
177+
/* Setup the communication infrastructure */
178+
/*
179+
* OOB Layer
180+
*/
181+
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
182+
ORTE_ERROR_LOG(ret);
183+
error = "orte_oob_base_open";
184+
goto error;
185+
}
186+
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
187+
ORTE_ERROR_LOG(ret);
188+
error = "orte_oob_base_select";
189+
goto error;
190+
}
191+
/* Runtime Messaging Layer */
192+
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
193+
ORTE_ERROR_LOG(ret);
194+
error = "orte_rml_base_open";
195+
goto error;
196+
}
197+
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
198+
ORTE_ERROR_LOG(ret);
199+
error = "orte_rml_base_select";
200+
goto error;
201+
}
202+
/* Messaging QoS Layer */
203+
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) {
204+
ORTE_ERROR_LOG(ret);
205+
error = "orte_qos_base_open";
206+
goto error;
207+
}
208+
if (ORTE_SUCCESS != (ret = orte_qos_base_select())) {
209+
ORTE_ERROR_LOG(ret);
210+
error = "orte_qos_base_select";
211+
goto error;
212+
}
173213
/* setup the errmgr */
174214
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
175215
ORTE_ERROR_LOG(ret);
176216
error = "orte_errmgr_base_select";
177217
goto error;
178218
}
179-
219+
/* Routed system */
220+
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
221+
ORTE_ERROR_LOG(ret);
222+
error = "orte_routed_base_open";
223+
goto error;
224+
}
225+
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
226+
ORTE_ERROR_LOG(ret);
227+
error = "orte_routed_base_select";
228+
goto error;
229+
}
230+
/*
231+
* Group communications
232+
*/
233+
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
234+
ORTE_ERROR_LOG(ret);
235+
error = "orte_grpcomm_base_open";
236+
goto error;
237+
}
238+
if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
239+
ORTE_ERROR_LOG(ret);
240+
error = "orte_grpcomm_base_select";
241+
goto error;
242+
}
243+
/* enable communication via the rml */
244+
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
245+
ORTE_ERROR_LOG(ret);
246+
error = "orte_rml.enable_comm";
247+
goto error;
248+
}
249+
/* setup the routed info */
250+
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
251+
ORTE_ERROR_LOG(ret);
252+
error = "orte_routed.init_routes";
253+
goto error;
254+
}
180255
#if OPAL_ENABLE_FT_CR == 1
181256
/*
182257
* Setup the SnapC
@@ -247,7 +322,13 @@ int orte_ess_base_app_finalize(void)
247322
(void) mca_base_framework_close(&orte_filem_base_framework);
248323
(void) mca_base_framework_close(&orte_errmgr_base_framework);
249324

325+
/* now we can close the rml and its friendly group comm */
326+
(void) mca_base_framework_close(&orte_grpcomm_base_framework);
250327
(void) mca_base_framework_close(&orte_dfs_base_framework);
328+
(void) mca_base_framework_close(&orte_routed_base_framework);
329+
330+
(void) mca_base_framework_close(&orte_rml_base_framework);
331+
(void) mca_base_framework_close(&orte_oob_base_framework);
251332
(void) mca_base_framework_close(&orte_state_base_framework);
252333

253334
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
@@ -296,7 +377,7 @@ void orte_ess_base_app_abort(int status, bool report)
296377
* the message if routing is enabled as this indicates we
297378
* have someone to send to
298379
*/
299-
if (report && orte_create_session_dirs) {
380+
if (report && orte_routing_is_enabled && orte_create_session_dirs) {
300381
myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
301382
fd = open(myfile, O_CREAT, S_IRUSR);
302383
close(fd);

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
#include "opal/mca/pmix/base/base.h"
4949

5050
#include "orte/mca/errmgr/errmgr.h"
51+
#include "orte/mca/grpcomm/grpcomm.h"
52+
#include "orte/mca/rml/rml.h"
5153
#include "orte/util/proc_info.h"
5254
#include "orte/util/show_help.h"
5355
#include "orte/util/name_fns.h"
@@ -85,6 +87,7 @@ static int rte_init(void)
8587
char *envar, *ev1, *ev2;
8688
uint64_t unique_key[2];
8789
char *string_key;
90+
char *rmluri;
8891
opal_value_t *kv;
8992
char *val;
9093
int u32, *u32ptr;
@@ -379,6 +382,16 @@ static int rte_init(void)
379382

380383
/*** PUSH DATA FOR OTHERS TO FIND ***/
381384

385+
/* push our RML URI in case others need to talk directly to us */
386+
rmluri = orte_rml.get_contact_info();
387+
/* push it out for others to use */
388+
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING);
389+
if (ORTE_SUCCESS != ret) {
390+
error = "pmix put uri";
391+
goto error;
392+
}
393+
free(rmluri);
394+
382395
/* push our hostname so others can find us, if they need to */
383396
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING);
384397
if (ORTE_SUCCESS != ret) {

orte/mca/oob/usock/Makefile.am

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#
2+
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3+
# University Research and Technology
4+
# Corporation. All rights reserved.
5+
# Copyright (c) 2004-2005 The University of Tennessee and The University
6+
# of Tennessee Research Foundation. All rights
7+
# reserved.
8+
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
9+
# University of Stuttgart. All rights reserved.
10+
# Copyright (c) 2004-2005 The Regents of the University of California.
11+
# All rights reserved.
12+
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
13+
# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
14+
# All rights reserved
15+
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
16+
# $COPYRIGHT$
17+
#
18+
# Additional copyrights may follow
19+
#
20+
# $HEADER$
21+
#
22+
23+
sources = \
24+
oob_usock_component.h \
25+
oob_usock.h \
26+
oob_usock_component.c \
27+
oob_usock_connection.h \
28+
oob_usock_sendrecv.h \
29+
oob_usock_hdr.h \
30+
oob_usock_peer.h \
31+
oob_usock_ping.h \
32+
oob_usock.c \
33+
oob_usock_connection.c \
34+
oob_usock_sendrecv.c
35+
36+
# Make the output library in this directory, and name it either
37+
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
38+
# (for static builds).
39+
40+
if MCA_BUILD_orte_oob_usock_DSO
41+
component_noinst =
42+
component_install = mca_oob_usock.la
43+
else
44+
component_noinst = libmca_oob_usock.la
45+
component_install =
46+
endif
47+
48+
mcacomponentdir = $(ortelibdir)
49+
mcacomponent_LTLIBRARIES = $(component_install)
50+
mca_oob_usock_la_SOURCES = $(sources)
51+
mca_oob_usock_la_LDFLAGS = -module -avoid-version
52+
53+
noinst_LTLIBRARIES = $(component_noinst)
54+
libmca_oob_usock_la_SOURCES = $(sources)
55+
libmca_oob_usock_la_LDFLAGS = -module -avoid-version
56+

0 commit comments

Comments
 (0)