Skip to content

Commit 56fe714

Browse files
Merge pull request #4637 from ggouaillardet/topic/tree_spawn_no_regex
orted: fix tree-spawn when the node regex is too long
2 parents ef38ca5 + 03da521 commit 56fe714

File tree

15 files changed

+145
-78
lines changed

15 files changed

+145
-78
lines changed

orte/mca/odls/odls_types.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
1515
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2018 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1618
* $COPYRIGHT$
1719
*
1820
* Additional copyrights may follow
@@ -44,7 +46,6 @@ typedef uint8_t orte_daemon_cmd_flag_t;
4446
#define ORTE_DAEMON_KILL_LOCAL_PROCS (orte_daemon_cmd_flag_t) 2
4547
#define ORTE_DAEMON_SIGNAL_LOCAL_PROCS (orte_daemon_cmd_flag_t) 3
4648
#define ORTE_DAEMON_ADD_LOCAL_PROCS (orte_daemon_cmd_flag_t) 4
47-
#define ORTE_DAEMON_TREE_SPAWN (orte_daemon_cmd_flag_t) 5
4849
#define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 6
4950
#define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 7
5051
#define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 9

orte/mca/plm/base/plm_base_frame.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14-
* Copyright (c) 2015 Research Organization for Information Science
14+
* Copyright (c) 2015-2017 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
1717
*
@@ -52,6 +52,19 @@ orte_plm_globals_t orte_plm_globals = {0};
5252
orte_plm_base_module_t orte_plm = {0};
5353

5454

55+
/**
 * Register the MCA parameters owned by the PLM base framework.
 *
 * Sets the default node regex threshold (1024) and exposes it as the
 * read-only, internal MCA variable "node_regex_threshold" of the
 * orte/plm/base component. The launch support code compares the length
 * of the node regex against this threshold to decide whether the regex
 * is passed on the orted command line.
 *
 * @param flags  registration flags supplied by the MCA framework (unused)
 *
 * @return ORTE_SUCCESS in all cases; the result of the variable
 *         registration itself is deliberately ignored.
 */
static int mca_plm_base_register(mca_base_register_flag_t flags)
{
    /* the default must be in place before registration so that it is
     * the value reported when the parameter is not overridden */
    orte_plm_globals.node_regex_threshold = 1024;

    /* the framework name is "plm" (process launch), not "pml": using
     * the wrong name would publish the variable in another framework's
     * namespace */
    (void) mca_base_var_register("orte", "plm", "base", "node_regex_threshold",
                                 "Only pass the node regex on the orted command line if smaller than this threshold",
                                 MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0,
                                 MCA_BASE_VAR_FLAG_INTERNAL,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &orte_plm_globals.node_regex_threshold);

    return ORTE_SUCCESS;
}
67+
5568
static int orte_plm_base_close(void)
5669
{
5770
int rc;
@@ -88,5 +101,5 @@ static int orte_plm_base_open(mca_base_open_flag_t flags)
88101
return mca_base_framework_components_open(&orte_plm_base_framework, flags);
89102
}
90103

91-
MCA_BASE_FRAMEWORK_DECLARE(orte, plm, NULL, NULL, orte_plm_base_open, orte_plm_base_close,
104+
MCA_BASE_FRAMEWORK_DECLARE(orte, plm, NULL, mca_plm_base_register, orte_plm_base_open, orte_plm_base_close,
92105
mca_plm_base_static_components, 0);

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1565,16 +1565,19 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
15651565
ORTE_ERROR_LOG(rc);
15661566
return rc;
15671567
}
1568+
if (NULL != orte_node_regex) {
1569+
free(orte_node_regex);
1570+
}
1571+
orte_node_regex = param;
15681572
/* if this is too long, then we'll have to do it with
15691573
* a phone home operation instead */
1570-
if (strlen(param) < ORTE_MAX_REGEX_CMD_LENGTH) {
1574+
if (strlen(param) < orte_plm_globals.node_regex_threshold) {
15711575
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
15721576
opal_argv_append(argc, argv, "orte_node_regex");
1573-
opal_argv_append(argc, argv, param);
1577+
opal_argv_append(argc, argv, orte_node_regex);
15741578
/* mark that the nidmap has been communicated */
15751579
orte_nidmap_communicated = true;
15761580
}
1577-
free(param);
15781581

15791582
if (!orte_static_ports && !orte_fwd_mpirun_port) {
15801583
/* if we are using static ports, or we are forwarding

orte/mca/plm/base/plm_private.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
1414
* Copyright (c) 2017 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2017 Research Organization for Information Science
16+
* and Technology (RIST). All rights reserved.
1517
* $COPYRIGHT$
1618
*
1719
* Additional copyrights may follow
@@ -59,6 +61,7 @@ typedef struct {
5961
opal_buffer_t tree_spawn_cmd;
6062
/* daemon nodes assigned at launch */
6163
bool daemon_nodes_assigned_at_launch;
64+
size_t node_regex_threshold;
6265
} orte_plm_globals_t;
6366
/**
6467
* Global instance of PLM framework data

orte/mca/plm/isolated/plm_isolated.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
1616
* Copyright (c) 2011 IBM Corporation. All rights reserved.
1717
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2017 Research Organization for Information Science
19+
* and Technology (RIST). All rights reserved.
1820
* $COPYRIGHT$
1921
*
2022
* Additional copyrights may follow
@@ -50,7 +52,7 @@
5052

5153
static int isolated_init(void);
5254
static int isolated_launch(orte_job_t *jdata);
53-
static int remote_spawn(opal_buffer_t *launch);
55+
static int remote_spawn();
5456
static int isolated_terminate_orteds(void);
5557
static int isolated_finalize(void);
5658

@@ -93,7 +95,7 @@ static int isolated_init(void)
9395
/*
9496
* launch a set of daemons from a remote daemon
9597
*/
96-
static int remote_spawn(opal_buffer_t *launch)
98+
static int remote_spawn()
9799
{
98100
/* unused function in this mode */
99101
return ORTE_SUCCESS;

orte/mca/plm/plm.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
* All rights reserved.
1313
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
1414
* reserved.
15+
* Copyright (c) 2017 Research Organization for Information Science
16+
* and Technology (RIST). All rights reserved.
1517
* $COPYRIGHT$
1618
*
1719
* Additional copyrights may follow
@@ -63,7 +65,7 @@ typedef int (*orte_plm_base_module_spawn_fn_t)(orte_job_t *jdata);
6365
/*
6466
* Remote spawn - spawn called by a daemon to launch a process on its own
6567
*/
66-
typedef int (*orte_plm_base_module_remote_spawn_fn_t)(opal_buffer_t *launch);
68+
typedef int (*orte_plm_base_module_remote_spawn_fn_t)(void);
6769

6870
/*
6971
* Entry point to set the HNP name

orte/mca/plm/rsh/plm_rsh_module.c

Lines changed: 12 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@
101101

102102
static int rsh_init(void);
103103
static int rsh_launch(orte_job_t *jdata);
104-
static int remote_spawn(opal_buffer_t *launch);
104+
static int remote_spawn(void);
105105
static int rsh_terminate_orteds(void);
106106
static int rsh_finalize(void);
107107

@@ -263,7 +263,8 @@ static int rsh_init(void)
263263
static void rsh_wait_daemon(int sd, short flags, void *cbdata)
264264
{
265265
orte_job_t *jdata;
266-
orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
266+
orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
267+
orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)t2->cbdata;
267268
orte_proc_t *daemon = caddy->daemon;
268269
char *rtmod;
269270

@@ -272,6 +273,7 @@ static void rsh_wait_daemon(int sd, short flags, void *cbdata)
272273
* session attached, e.g., while debugging
273274
*/
274275
OBJ_RELEASE(caddy);
276+
OBJ_RELEASE(t2);
275277
return;
276278
}
277279

@@ -325,7 +327,7 @@ static void rsh_wait_daemon(int sd, short flags, void *cbdata)
325327
opal_event_active(&launch_event, EV_WRITE, 1);
326328
}
327329
/* cleanup */
328-
OBJ_RELEASE(caddy);
330+
OBJ_RELEASE(t2);
329331
}
330332

331333
static int setup_launch(int *argcptr, char ***argvptr,
@@ -782,7 +784,7 @@ static void ssh_child(int argc, char **argv)
782784
/*
783785
* launch a set of daemons from a remote daemon
784786
*/
785-
static int remote_spawn(opal_buffer_t *launch)
787+
static int remote_spawn()
786788
{
787789
int node_name_index1;
788790
int proc_vpid_index;
@@ -791,7 +793,6 @@ static int remote_spawn(opal_buffer_t *launch)
791793
int argc;
792794
int rc=ORTE_SUCCESS;
793795
bool failed_launch = true;
794-
orte_std_cntr_t n;
795796
orte_process_name_t target;
796797
orte_plm_rsh_caddy_t *caddy;
797798
orte_job_t *daemons;
@@ -806,22 +807,13 @@ static int remote_spawn(opal_buffer_t *launch)
806807
/* if we hit any errors, tell the HNP it was us */
807808
target.vpid = ORTE_PROC_MY_NAME->vpid;
808809

809-
if (NULL != launch) {
810-
/* extract the prefix from the launch buffer */
811-
n = 1;
812-
if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) {
813-
ORTE_ERROR_LOG(rc);
814-
goto cleanup;
815-
}
810+
/* check to see if enable-orterun-prefix-by-default was given - if
811+
* this is being done by a singleton, then orterun will not be there
812+
* to put the prefix in the app. So make sure we check to find it */
813+
if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
814+
prefix = strdup(opal_install_dirs.prefix);
816815
} else {
817-
/* check to see if enable-orterun-prefix-by-default was given - if
818-
* this is being done by a singleton, then orterun will not be there
819-
* to put the prefix in the app. So make sure we check to find it */
820-
if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
821-
prefix = strdup(opal_install_dirs.prefix);
822-
} else {
823-
prefix = NULL;
824-
}
816+
prefix = NULL;
825817
}
826818

827819
/* get the updated routing list */
@@ -1179,24 +1171,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
11791171

11801172
/* if we are tree launching, find our children and create the launch cmd */
11811173
if (!mca_plm_rsh_component.no_tree_spawn) {
1182-
orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN;
11831174
orte_job_t *jdatorted;
11841175

1185-
/* get the tree spawn buffer */
1186-
orte_tree_launch_cmd = OBJ_NEW(opal_buffer_t);
1187-
/* insert the tree_spawn cmd */
1188-
if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) {
1189-
ORTE_ERROR_LOG(rc);
1190-
OBJ_RELEASE(orte_tree_launch_cmd);
1191-
goto cleanup;
1192-
}
1193-
/* pack the prefix since this will be needed by the next wave */
1194-
if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &prefix_dir, 1, OPAL_STRING))) {
1195-
ORTE_ERROR_LOG(rc);
1196-
OBJ_RELEASE(orte_tree_launch_cmd);
1197-
goto cleanup;
1198-
}
1199-
12001176
/* get the orted job data object */
12011177
if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
12021178
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);

orte/mca/rml/base/rml_base_msg_handlers.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2017 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1618
* $COPYRIGHT$
1719
*
1820
* Additional copyrights may follow
@@ -172,8 +174,32 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata)
172174

173175
/* if this message is just to warmup the connection, then drop it */
174176
if (ORTE_RML_TAG_WARMUP_CONNECTION == msg->tag) {
175-
OBJ_RELEASE(msg);
176-
return;
177+
if (!orte_nidmap_communicated) {
178+
opal_buffer_t * buffer = OBJ_NEW(opal_buffer_t);
179+
int rc;
180+
if (NULL == buffer) {
181+
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
182+
return;
183+
}
184+
assert (NULL != orte_node_regex);
185+
186+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &orte_node_regex, 1, OPAL_STRING))) {
187+
ORTE_ERROR_LOG(rc);
188+
OBJ_RELEASE(buffer);
189+
return;
190+
}
191+
192+
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
193+
&msg->sender, buffer,
194+
ORTE_RML_TAG_NODE_REGEX_REPORT,
195+
orte_rml_send_callback, NULL))) {
196+
ORTE_ERROR_LOG(rc);
197+
OBJ_RELEASE(buffer);
198+
return;
199+
}
200+
OBJ_RELEASE(msg);
201+
return;
202+
}
177203
}
178204

179205
/* see if we have a waiting recv for this message */

orte/mca/rml/rml_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* reserved.
1414
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2017 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1618
* $COPYRIGHT$
1719
*
1820
* Additional copyrights may follow
@@ -175,6 +177,9 @@ BEGIN_C_DECLS
175177
/* warmup connection - simply establishes the connection */
176178
#define ORTE_RML_TAG_WARMUP_CONNECTION 63
177179

180+
/* node regex report */
181+
#define ORTE_RML_TAG_NODE_REGEX_REPORT 64
182+
178183
#define ORTE_RML_TAG_MAX 100
179184

180185

orte/mca/sstore/stage/sstore_stage_local.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
/*
2-
* Copyright (c) 2010 The Trustees of Indiana University.
2+
* Copyright (c) 2010 The Trustees of Indiana University.
33
* All rights reserved.
44
* Copyright (c) 2004-2011 The University of Tennessee and The University
55
* of Tennessee Research Foundation. All rights
66
* reserved.
7+
* Copyright (c) 2017 Research Organization for Information Science
8+
* and Technology (RIST). All rights reserved.
79
* $COPYRIGHT$
810
*
911
* Additional copyrights may follow
@@ -1550,8 +1552,9 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
15501552
static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbdata)
15511553
{
15521554
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
1555+
orte_wait_tracker_t *t2 = (orte_wait_tracker_t *)cbdata;
15531556

1554-
app_info = (orte_sstore_stage_local_app_snapshot_info_t*)cbdata;
1557+
app_info = (orte_sstore_stage_local_app_snapshot_info_t*)t2->cbdata;
15551558

15561559
OPAL_OUTPUT_VERBOSE((10, mca_sstore_stage_component.super.output_handle,
15571560
"sstore:stage:(local): waitpid(%6d) Compression finished for Process %s",
@@ -1560,6 +1563,7 @@ static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbda
15601563

15611564
app_info->compress_pid = 0;
15621565
OBJ_RELEASE(proc);
1566+
OBJ_RELEASE(t2);
15631567
}
15641568

15651569
static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info)

0 commit comments

Comments
 (0)