From d7787a981f2fba61ce2e50c7ede8d5e950900a0e Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 13 Sep 2017 10:22:44 +0900 Subject: [PATCH 1/8] checkpoint --- ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 10 +- ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.h | 8 +- ompi/mca/crcp/crcp.h | 8 +- ompi/mca/pml/base/pml_base_recvreq.h | 3 + ompi/mca/pml/base/pml_base_request.h | 3 +- ompi/mca/pml/base/pml_base_sendreq.h | 3 +- ompi/mca/pml/bfo/pml_bfo.c | 6 ++ ompi/mca/pml/cm/pml_cm.c | 6 ++ ompi/mca/pml/crcpw/pml_crcpw.h | 8 +- ompi/mca/pml/crcpw/pml_crcpw_module.c | 9 +- ompi/mca/pml/example/pml_example.c | 6 ++ ompi/mca/pml/ob1/pml_ob1.c | 6 ++ ompi/mca/pml/ob1/pml_ob1.h | 31 +++++- ompi/mca/pml/ob1/pml_ob1_irecv.c | 78 +++++++++++++++ ompi/mca/pml/ob1/pml_ob1_isend.c | 136 +++++++++++++++++++++++++- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 127 +++++++++++++++++++++++- ompi/mca/pml/ob1/pml_ob1_recvreq.h | 11 ++- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 3 +- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 30 ++++++ ompi/mca/pml/pml.h | 89 ++++++++++++++++- ompi/mca/pml/ucx/pml_ucx.c | 6 ++ ompi/mca/pml/yalla/pml_yalla.c | 6 +- 22 files changed, 568 insertions(+), 25 deletions(-) diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c index 7d5e480095a..9b853970110 100644 --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c @@ -9,6 +9,8 @@ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1497,7 +1499,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_del_procs( /**************** Send *****************/ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_isend_init( - void *buf, size_t count, + const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, @@ -1558,7 +1560,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_isend_init( * Update Message */ HOKE_CONTENT_REF_ALLOC(new_content); - new_content->buffer = buf; + new_content->buffer = (void *)buf; new_content->request = *request; new_content->done = false; new_content->active = false; @@ -1710,7 +1712,7 @@ static int ompi_crcp_bkmrk_request_complete_isend_init(struct ompi_request_t *re ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_isend( - void *buf, size_t count, + const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, @@ -1869,7 +1871,7 @@ static int ompi_crcp_bkmrk_request_complete_isend(struct ompi_request_t *request ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_send( - void *buf, size_t count, + const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.h b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.h index 11784213599..0841870aed0 100644 --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.h +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.h @@ -8,6 +8,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -69,21 +71,21 @@ BEGIN_C_DECLS ompi_crcp_base_pml_state_t* pml_state ); ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_isend_init - ( void *buf, size_t count, ompi_datatype_t *datatype, + ( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request, ompi_crcp_base_pml_state_t* pml_state ); ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_isend - ( void *buf, size_t count, ompi_datatype_t *datatype, + ( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request, ompi_crcp_base_pml_state_t* pml_state ); ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_send - ( void *buf, size_t count, ompi_datatype_t *datatype, + ( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, ompi_crcp_base_pml_state_t* pml_state ); diff --git a/ompi/mca/crcp/crcp.h b/ompi/mca/crcp/crcp.h index ff43aa029c9..2dcdba0536e 100644 --- a/ompi/mca/crcp/crcp.h +++ b/ompi/mca/crcp/crcp.h @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -128,17 +130,17 @@ typedef ompi_crcp_base_pml_state_t* (*ompi_crcp_base_pml_probe_fn_t) ompi_status_public_t* status, ompi_crcp_base_pml_state_t* ); typedef ompi_crcp_base_pml_state_t* (*ompi_crcp_base_pml_isend_init_fn_t) - ( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + ( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request, ompi_crcp_base_pml_state_t* ); typedef ompi_crcp_base_pml_state_t* (*ompi_crcp_base_pml_isend_fn_t) - ( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + ( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request, ompi_crcp_base_pml_state_t* ); typedef ompi_crcp_base_pml_state_t* (*ompi_crcp_base_pml_send_fn_t) - ( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + ( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, ompi_crcp_base_pml_state_t* ); diff --git a/ompi/mca/pml/base/pml_base_recvreq.h b/ompi/mca/pml/base/pml_base_recvreq.h index d20663e12f2..57a92b7307f 100644 --- a/ompi/mca/pml/base/pml_base_recvreq.h +++ b/ompi/mca/pml/base/pml_base_recvreq.h @@ -13,6 +13,8 @@ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2016 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -74,6 +76,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_pml_base_recv_request_t); (request)->req_bytes_packed = 0; \ (request)->req_base.req_addr = addr; \ (request)->req_base.req_count = count; \ + (request)->req_base.req_offset = 0; \ (request)->req_base.req_peer = src; \ (request)->req_base.req_tag = tag; \ (request)->req_base.req_comm = comm; \ diff --git a/ompi/mca/pml/base/pml_base_request.h b/ompi/mca/pml/base/pml_base_request.h index 90a86505e07..2749dfffd0a 100644 --- a/ompi/mca/pml/base/pml_base_request.h +++ b/ompi/mca/pml/base/pml_base_request.h @@ -13,7 +13,7 @@ * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -72,6 +72,7 @@ struct mca_pml_base_request_t { void *req_addr; /**< pointer to application buffer */ size_t req_count; /**< count of user datatype elements */ + size_t req_offset; /**< offset the request starts at */ int32_t req_peer; /**< peer process - rank w/in this communicator */ int32_t req_tag; /**< user defined tag */ struct ompi_proc_t* req_proc; /**< peer process */ diff --git a/ompi/mca/pml/base/pml_base_sendreq.h b/ompi/mca/pml/base/pml_base_sendreq.h index 3f6cce1e578..ad95482af40 100644 --- a/ompi/mca/pml/base/pml_base_sendreq.h +++ b/ompi/mca/pml/base/pml_base_sendreq.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Los Alamos National Security, LLC. All rights * reserved. @@ -87,6 +87,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION( mca_pml_base_send_request_t ); (request)->req_send_mode = mode; \ (request)->req_base.req_addr = (void *)addr; \ (request)->req_base.req_count = count; \ + (request)->req_base.req_offset= 0; \ (request)->req_base.req_datatype = datatype; \ (request)->req_base.req_peer = (int32_t)peer; \ (request)->req_base.req_tag = (int32_t)tag; \ diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c index e3a1beb447a..25afee4fd07 100644 --- a/ompi/mca/pml/bfo/pml_bfo.c +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -16,6 +16,8 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -64,9 +66,13 @@ mca_pml_bfo_t mca_pml_bfo = { mca_pml_bfo_irecv_init, mca_pml_bfo_irecv, mca_pml_bfo_recv, + NULL, /* icrecv */ + NULL, /* crecv */ mca_pml_bfo_isend_init, mca_pml_bfo_isend, mca_pml_bfo_send, + NULL, /* icsend */ + NULL, /* csend */ mca_pml_bfo_iprobe, mca_pml_bfo_probe, mca_pml_bfo_start, diff --git a/ompi/mca/pml/cm/pml_cm.c b/ompi/mca/pml/cm/pml_cm.c index a7322e4c331..5f9246be967 100644 --- a/ompi/mca/pml/cm/pml_cm.c +++ b/ompi/mca/pml/cm/pml_cm.c @@ -11,6 +11,8 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. 
All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,9 +42,13 @@ ompi_pml_cm_t ompi_pml_cm = { mca_pml_cm_irecv_init, mca_pml_cm_irecv, mca_pml_cm_recv, + NULL, /* icrecv */ + NULL, /* crecv */ mca_pml_cm_isend_init, mca_pml_cm_isend, mca_pml_cm_send, + NULL, /* icsend */ + NULL, /* csend */ mca_pml_cm_iprobe, mca_pml_cm_probe, mca_pml_cm_start, diff --git a/ompi/mca/pml/crcpw/pml_crcpw.h b/ompi/mca/pml/crcpw/pml_crcpw.h index 478253c150f..6be9de78c74 100644 --- a/ompi/mca/pml/crcpw/pml_crcpw.h +++ b/ompi/mca/pml/crcpw/pml_crcpw.h @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -108,13 +110,13 @@ BEGIN_C_DECLS struct ompi_message_t **message, ompi_status_public_t* status ); - int mca_pml_crcpw_isend_init( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + int mca_pml_crcpw_isend_init( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request ); - int mca_pml_crcpw_isend( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + int mca_pml_crcpw_isend( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request ); - int mca_pml_crcpw_send( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + int mca_pml_crcpw_send( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm ); int mca_pml_crcpw_irecv_init( void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, diff --git a/ompi/mca/pml/crcpw/pml_crcpw_module.c b/ompi/mca/pml/crcpw/pml_crcpw_module.c index c5982c55383..1fa578a4868 100644 --- a/ompi/mca/pml/crcpw/pml_crcpw_module.c +++ b/ompi/mca/pml/crcpw/pml_crcpw_module.c @@ -13,6 +13,8 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -46,6 +48,7 @@ mca_pml_crcpw_module_t mca_pml_crcpw_module = { mca_pml_crcpw_isend_init, mca_pml_crcpw_isend, mca_pml_crcpw_send, + NULL, /* csend */ mca_pml_crcpw_iprobe, mca_pml_crcpw_probe, mca_pml_crcpw_start, @@ -368,7 +371,7 @@ int mca_pml_crcpw_probe( int dst, int tag, struct ompi_communicator_t* comm, omp return OMPI_SUCCESS; } -int mca_pml_crcpw_isend_init( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, +int mca_pml_crcpw_isend_init( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request ) { int ret; @@ -407,7 +410,7 @@ int mca_pml_crcpw_isend_init( void *buf, size_t count, ompi_datatype_t *datatype return OMPI_SUCCESS; } -int mca_pml_crcpw_isend( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, +int mca_pml_crcpw_isend( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm, struct ompi_request_t **request ) { int ret; @@ -449,7 +452,7 @@ int mca_pml_crcpw_isend( void *buf, size_t count, ompi_datatype_t *datatype, int return OMPI_SUCCESS; } -int mca_pml_crcpw_send( void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, +int mca_pml_crcpw_send( const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm ) { int ret; diff --git a/ompi/mca/pml/example/pml_example.c b/ompi/mca/pml/example/pml_example.c index 799e3abe459..e62d8c255f0 100644 --- a/ompi/mca/pml/example/pml_example.c +++ b/ompi/mca/pml/example/pml_example.c @@ -6,6 +6,8 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,9 +32,13 @@ mca_pml_example_t mca_pml_example = { mca_pml_example_irecv_init, mca_pml_example_irecv, mca_pml_example_recv, + NULL, /* icrecv */ + NULL, /* crecv */ mca_pml_example_isend_init, mca_pml_example_isend, mca_pml_example_send, + NULL, /* icsend */ + NULL, /* csend */ mca_pml_example_iprobe, mca_pml_example_probe, mca_pml_example_start, diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index eaf3ab26e0f..68fccc0ab9e 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -18,6 +18,8 @@ * reserved. * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -64,9 +66,13 @@ mca_pml_ob1_t mca_pml_ob1 = { mca_pml_ob1_irecv_init, mca_pml_ob1_irecv, mca_pml_ob1_recv, + mca_pml_ob1_icrecv, + mca_pml_ob1_crecv, mca_pml_ob1_isend_init, mca_pml_ob1_isend, mca_pml_ob1_send, + mca_pml_ob1_icsend, + mca_pml_ob1_csend, mca_pml_ob1_iprobe, mca_pml_ob1_probe, mca_pml_ob1_start, diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index 4826587564a..65e84cf94c4 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -14,7 +14,7 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2012-2017 Los Alamos National Security, LLC. 
All rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -165,6 +165,21 @@ extern int mca_pml_ob1_send( const void *buf, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm ); +extern int mca_pml_ob1_icsend( struct opal_convertor_t* convertor, + size_t *size, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_ob1_csend( struct opal_convertor_t* convertor, + size_t *size, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm ); + extern int mca_pml_ob1_irecv_init( void *buf, size_t count, ompi_datatype_t *datatype, @@ -201,6 +216,20 @@ extern int mca_pml_ob1_mrecv( void *buf, struct ompi_message_t **message, ompi_status_public_t* status ); +extern int mca_pml_ob1_icrecv( opal_convertor_t* convertor, + size_t *size, + int src, + int tag, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_ob1_crecv( opal_convertor_t* convertor, + size_t *size, + int src, + int tag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status ); + extern int mca_pml_ob1_dump( struct ompi_communicator_t* comm, int verbose ); diff --git a/ompi/mca/pml/ob1/pml_ob1_irecv.c b/ompi/mca/pml/ob1/pml_ob1_irecv.c index 37c0ce9e9e8..36162f2005e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_irecv.c +++ b/ompi/mca/pml/ob1/pml_ob1_irecv.c @@ -362,3 +362,81 @@ mca_pml_ob1_mrecv( void *buf, return rc; } + +int mca_pml_ob1_icrecv(opal_convertor_t *convertor, + size_t *size, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + mca_pml_ob1_recv_request_t *recvreq; + MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq); + if (NULL == recvreq) + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, + convertor->pBaseBuf, + convertor->count, (ompi_datatype_t *)convertor->pDesc, src, tag, comm, false); + recvreq->req_recv.req_base.req_offset = convertor->bConverted; + // recvreq->req_recv.req_bytes_expected = *size; + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + // MCA_PML_OB1_RECV_REQUEST_START(recvreq); + mca_pml_ob1_recv_req_start_with_convertor(recvreq, convertor, *size); + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + + +int mca_pml_ob1_crecv(opal_convertor_t *convertor, + size_t *size, + int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + mca_pml_ob1_recv_request_t *recvreq = NULL; + int rc; + + if (OPAL_LIKELY(!ompi_mpi_thread_multiple)) { + recvreq = mca_pml_ob1_recvreq; + mca_pml_ob1_recvreq = NULL; + } + + if( OPAL_UNLIKELY(NULL == recvreq) ) { + MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq); + if (NULL == recvreq) + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + +#if 0 + MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, addr, count, datatype, + src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(recvreq->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_OB1_RECV_REQUEST_START(recvreq); + ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi); + + if (NULL != status) { /* return status */ + *status = recvreq->req_recv.req_base.req_ompi.req_status; + } + + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + + if (OPAL_UNLIKELY(ompi_mpi_thread_multiple 
|| NULL != mca_pml_ob1_recvreq)) { + MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq); + } else { + mca_pml_ob1_recv_request_fini (recvreq); + mca_pml_ob1_recvreq = recvreq; + } +#endif + + return rc; +} diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 3a5b0c2d7a0..97687e1bb7f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -13,7 +13,7 @@ * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -277,3 +277,137 @@ int mca_pml_ob1_send(const void *buf, return rc; } + +int mca_pml_ob1_icsend(opal_convertor_t* convertor, + size_t *size, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst); + mca_pml_ob1_send_request_t *sendreq = NULL; + ompi_proc_t *dst_proc = ob1_proc->ompi_proc; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc); + int16_t seqn; + int rc; + + if (OPAL_UNLIKELY(NULL == endpoint)) { + return OMPI_ERR_UNREACH; + } + + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); + +#if 0 + if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { + rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc, + endpoint, comm); + if (OPAL_LIKELY(0 <= rc)) { + /* NTH: it is legal to return ompi_request_empty since the only valid + * field in a send completion status is whether or not the send was + * cancelled (which it can't be at this point anyway). 
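Note: this inline fast path is compiled out in the convertor based variant, since mca_pml_ob1_send_inline works on a raw buf/count/datatype triple that this function is not given.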
*/ + *request = &ompi_request_empty; + return OMPI_SUCCESS; + } + } +#endif + + MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq); + if (NULL == sendreq) + return OMPI_ERR_OUT_OF_RESOURCE; + + MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, + convertor->pBaseBuf, + 0, + (ompi_datatype_t *)convertor->pDesc, + dst, tag, + comm, sendmode, false); + sendreq->req_send.req_base.req_offset = convertor->bConverted; + sendreq->req_send.req_base.req_count = convertor->count; + opal_convertor_clone(convertor, &sendreq->req_send.req_base.req_convertor, 1); + sendreq->req_send.req_bytes_packed = *size; + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc); + *request = (ompi_request_t *) sendreq; + return rc; +} + +int mca_pml_ob1_csend(struct opal_convertor_t* convertor, + size_t *size, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + struct ompi_communicator_t* comm) + +{ + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst); + ompi_proc_t *dst_proc = ob1_proc->ompi_proc; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc); + mca_pml_ob1_send_request_t *sendreq = NULL; + int16_t seqn; + int rc; + +#if 0 + assert(buf == convertor->pBaseBuf); + assert(count == convertor->count); + assert(&(datatype->super) == convertor->pDesc); +#endif + + if (OPAL_UNLIKELY(NULL == endpoint)) { + return OMPI_ERR_UNREACH; + } + + assert (MCA_PML_BASE_SEND_BUFFERED != sendmode); + + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); + + if (OPAL_LIKELY(!ompi_mpi_thread_multiple)) { + sendreq = mca_pml_ob1_sendreq; + mca_pml_ob1_sendreq = NULL; + } + + if( OPAL_UNLIKELY(NULL == sendreq) ) { + MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq); + if (NULL == sendreq) + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + sendreq->req_send.req_base.req_proc = dst_proc; + sendreq->rdma_frag = NULL; + + MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, + convertor->pBaseBuf, + 0, + (ompi_datatype_t *)convertor->pDesc, + dst, tag, + comm, sendmode, false); + sendreq->req_send.req_base.req_offset = convertor->bConverted; + sendreq->req_send.req_base.req_count = convertor->count; + opal_convertor_clone(convertor, &sendreq->req_send.req_base.req_convertor, 1); + sendreq->req_send.req_bytes_packed = *size; + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &sendreq->req_send.req_base, + PERUSE_SEND); + + MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc); + if (OPAL_LIKELY(rc == OMPI_SUCCESS)) { + ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); + + rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; + } + + if (OPAL_UNLIKELY(ompi_mpi_thread_multiple || NULL != mca_pml_ob1_sendreq)) { + MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq); + } else { + mca_pml_ob1_send_request_fini (sendreq); + mca_pml_ob1_sendreq = sendreq; + } + + return rc; +} diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index bbc90e1e471..32b3ac97219 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -496,7 +496,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, sizeof(mca_pml_ob1_frag_hdr_t)); - data_offset = hdr->hdr_frag.hdr_frag_offset; + data_offset = hdr->hdr_frag.hdr_frag_offset + recvreq->req_recv.req_base.req_offset; /* * Make user buffer accessible(defined) 
before unpacking. @@ -1271,3 +1271,128 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) } } } + +void mca_pml_ob1_recv_req_start_with_convertor(mca_pml_ob1_recv_request_t *req, opal_convertor_t *convertor, size_t size) +{ + ompi_communicator_t *comm = req->req_recv.req_base.req_comm; + mca_pml_ob1_comm_t *ob1_comm = comm->c_pml_comm; + mca_pml_ob1_comm_proc_t* proc; + mca_pml_ob1_recv_frag_t* frag; + opal_list_t *queue; + mca_pml_ob1_hdr_t* hdr; + + /* init/re-init the request */ + req->req_lock = 0; + req->req_pipeline_depth = 0; + req->req_bytes_received = 0; + req->req_bytes_expected = 0; + /* What about req_rdma_cnt ? */ + req->req_rdma_idx = 0; + req->req_pending = false; + req->req_ack_sent = false; + + MCA_PML_BASE_RECV_START(&req->req_recv.req_base); + + OB1_MATCHING_LOCK(&ob1_comm->matching_lock); + /** + * The laps of time between the ACTIVATE event and the SEARCH_UNEX one include + * the cost of the request lock. + */ + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_BEGIN, + &(req->req_recv.req_base), PERUSE_RECV); + + /* assign sequence number */ + req->req_recv.req_base.req_sequence = ob1_comm->recv_sequence++; + + /* attempt to match posted recv */ + if(req->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { + frag = recv_req_match_wild(req, &proc); + queue = &ob1_comm->wild_receives; +#if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT + /* As we are in a homogeneous environment we know that all remote + * architectures are exactly the same as the local one. Therefore, + * we can safely construct the convertor based on the proc + * information of rank 0. + */ + if( NULL == frag ) { + req->req_recv.req_base.req_proc = ompi_proc_local_proc; + prepare_recv_req_convertor(req, convertor, size); + } +#endif /* !OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + } else { + proc = mca_pml_ob1_peer_lookup (comm, req->req_recv.req_base.req_peer); + req->req_recv.req_base.req_proc = proc->ompi_proc; + frag = recv_req_match_specific_proc(req, proc); + queue = &proc->specific_receives; + /* wild cardrecv will be prepared on match */ + prepare_recv_req_convertor(req, convertor, size); + } + + if(OPAL_UNLIKELY(NULL == frag)) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + /* We didn't find any matches. Record this irecv so we can match + it when the message comes in. 
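For a convertor based receive the request also carries the caller's starting offset (req_offset), so unpacking can resume at the correct position in the user buffer once the data arrives.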
*/ + append_recv_req_to_queue(queue, req); + req->req_match_received = false; + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); + } else { + if(OPAL_LIKELY(!IS_PROB_REQ(req))) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, + &(req->req_recv.req_base), PERUSE_RECV); + + hdr = (mca_pml_ob1_hdr_t*)frag->segments->seg_addr.pval; + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_REMOVE_FROM_UNEX_Q, + req->req_recv.req_base.req_comm, + hdr->hdr_match.hdr_src, + hdr->hdr_match.hdr_tag, + PERUSE_RECV); + + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + + opal_list_remove_item(&proc->unexpected_frags, + (opal_list_item_t*)frag); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); + + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_OB1_HDR_TYPE_MATCH: + mca_pml_ob1_recv_request_progress_match(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_OB1_HDR_TYPE_RNDV: + mca_pml_ob1_recv_request_progress_rndv(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_OB1_HDR_TYPE_RGET: + mca_pml_ob1_recv_request_progress_rget(req, frag->btl, frag->segments, + frag->num_segments); + break; + default: + assert(0); + } + + MCA_PML_OB1_RECV_FRAG_RETURN(frag); + + } else if (OPAL_UNLIKELY(IS_MPROB_REQ(req))) { + /* Remove the fragment from the match list, as it's now + matched. Stash it somewhere in the request (which, + yes, is a complete hack), where it will be plucked out + during the end of mprobe. The request will then be + "recreated" as a receive request, and the frag will be + restarted with this request during mrecv */ + opal_list_remove_item(&proc->unexpected_frags, + (opal_list_item_t*)frag); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); + + req->req_recv.req_base.req_addr = frag; + mca_pml_ob1_recv_request_matched_probe(req, frag->btl, + frag->segments, frag->num_segments); + + } else { + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); + mca_pml_ob1_recv_request_matched_probe(req, frag->btl, + frag->segments, frag->num_segments); + } + } +} diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 82c4767d834..f2d8c634e82 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* * $COPYRIGHT$ @@ -219,6 +219,7 @@ recv_request_pml_complete_check(mca_pml_ob1_recv_request_t *recvreq) } extern void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req); +extern void mca_pml_ob1_recv_req_start_with_convertor(mca_pml_ob1_recv_request_t *req, opal_convertor_t *convertor, size_t size); #define MCA_PML_OB1_RECV_REQUEST_START(r) mca_pml_ob1_recv_req_start(r) static inline void prepare_recv_req_converter(mca_pml_ob1_recv_request_t *req) @@ -236,6 +237,14 @@ static inline void prepare_recv_req_converter(mca_pml_ob1_recv_request_t *req) } } +static inline void prepare_recv_req_convertor(mca_pml_ob1_recv_request_t *req, opal_convertor_t *convertor, size_t size) +{ + if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) { + opal_convertor_clone(convertor, &req->req_recv.req_base.req_convertor, 1); + req->req_bytes_expected = size; + } +} + #define MCA_PML_OB1_RECV_REQUEST_MATCHED(request, hdr) \ recv_req_matched(request, hdr) diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index f358d733dab..eac167c6901 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -977,9 +977,10 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq) } /* pack into a descriptor */ - offset = (size_t)range->range_send_offset; + offset = (size_t)range->range_send_offset + sendreq->req_send.req_base.req_offset; opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, &offset); + offset -= sendreq->req_send.req_base.req_offset; range->range_send_offset = (uint64_t)offset; data_remaining = size; diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 5cb21f6aba6..b5b905e94f6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -14,6 +14,8 @@ * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved. * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -473,6 +475,34 @@ mca_pml_ob1_send_request_start_seq (mca_pml_ob1_send_request_t* sendreq, mca_bml return OMPI_SUCCESS; } +static inline int +mca_pml_ob1_send_request_start_seq_size (mca_pml_ob1_send_request_t* sendreq, mca_bml_base_endpoint_t* endpoint, int32_t seqn, size_t *size) +{ + sendreq->req_endpoint = endpoint; + sendreq->req_state = 0; + sendreq->req_lock = 0; + sendreq->req_pipeline_depth = 0; + sendreq->req_bytes_delivered = 0; + sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE; + sendreq->req_send.req_base.req_sequence = seqn; + + MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base ); + + for(size_t i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { + mca_bml_base_btl_t* bml_btl; + int rc; + + /* select a btl */ + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl); + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) + return rc; + } + add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true); + + return OMPI_SUCCESS; +} + static inline int mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq ) { diff --git a/ompi/mca/pml/pml.h b/ompi/mca/pml/pml.h index 243b5993dda..7d938196480 100644 --- a/ompi/mca/pml/pml.h +++ b/ompi/mca/pml/pml.h @@ -13,7 +13,7 @@ * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -70,6 +70,7 @@ #include "mpi.h" /* needed for MPI_ANY_TAG */ #include "ompi/mca/pml/pml_constants.h" #include "ompi/request/request.h" +#include "opal/datatype/opal_convertor.h" BEGIN_C_DECLS @@ -276,6 +277,45 @@ typedef int (*mca_pml_base_module_mrecv_fn_t)( ompi_status_public_t* status ); +/** + * Post a convertor based receive request. + * + * @param convertor (INOUT)Convertor. + * @param size (INOUT) Max size to be sent. + * @param src (IN) Source rank w/in communicator. + * @param tag (IN) User defined tag. + * @param comm (IN) Communicator. + * @param request (OUT) Request handle. + * @return OMPI_SUCCESS or failure status. + */ +typedef int (*mca_pml_base_module_icrecv_fn_t)( + opal_convertor_t *convertor, + size_t *size, + int src, + int tag, + struct ompi_communicator_t* comm, + struct ompi_request_t **request +); + +/** + * Post a convertor based receive and wait for completion. + * + * @param convertor (INOUT)Convertor. + * @param size (INOUT) Max size to be sent. + * @param src (IN) Source rank w/in communicator + * @param tag (IN) User defined tag + * @param comm (IN) Communicator + * @param status (OUT) Completion status + * @return OMPI_SUCCESS or failure status. + */ +typedef int (*mca_pml_base_module_crecv_fn_t)( + opal_convertor_t *convertor, + size_t *size, + int src, + int tag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status +); /** * Initialize a persistent send request. * @@ -348,6 +388,49 @@ typedef int (*mca_pml_base_module_send_fn_t)( struct ompi_communicator_t* comm ); +/** + * Post a convertor based send request. + * + * @param convertor (INOUT)Convertor. + * @param size (INOUT) Max size to be sent. + * @param dst (IN) Peer rank w/in communicator. + * @param tag (IN) User defined tag. 
+ * @param mode (IN) Send mode (STANDARD,BUFFERED,SYNCHRONOUS,READY) + * @param comm (IN) Communicator. + * @param request (OUT) Request handle. + * @return OMPI_SUCCESS or failure status. + */ +typedef int (*mca_pml_base_module_icsend_fn_t)( + opal_convertor_t *convertor, + size_t *size, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request +); + + +/** + * Post a convertor based send request and wait for completion. + * + * @param convertor (INOUT)Convertor. + * @param size (INOUT) Max size to be sent. + * @param dst (IN) Peer rank w/in communicator. + * @param tag (IN) User defined tag. + * @param mode (IN) Send mode (STANDARD,BUFFERED,SYNCHRONOUS,READY) + * @param comm (IN) Communicator. + * @return OMPI_SUCCESS or failure status. + */ +typedef int (*mca_pml_base_module_csend_fn_t)( + opal_convertor_t *convertor, + size_t *size, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm +); + /** * Initiate one or more persistent requests. * @@ -502,9 +585,13 @@ struct mca_pml_base_module_1_0_1_t { mca_pml_base_module_irecv_init_fn_t pml_irecv_init; mca_pml_base_module_irecv_fn_t pml_irecv; mca_pml_base_module_recv_fn_t pml_recv; + mca_pml_base_module_icrecv_fn_t pml_icrecv; + mca_pml_base_module_crecv_fn_t pml_crecv; mca_pml_base_module_isend_init_fn_t pml_isend_init; mca_pml_base_module_isend_fn_t pml_isend; mca_pml_base_module_send_fn_t pml_send; + mca_pml_base_module_icsend_fn_t pml_icsend; + mca_pml_base_module_csend_fn_t pml_csend; mca_pml_base_module_iprobe_fn_t pml_iprobe; mca_pml_base_module_probe_fn_t pml_probe; mca_pml_base_module_start_fn_t pml_start; diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 26700ef0758..d522ce22ef6 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -3,6 +3,8 @@ * Copyright (c) 2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -56,9 +58,13 @@ mca_pml_ucx_module_t ompi_pml_ucx = { mca_pml_ucx_irecv_init, mca_pml_ucx_irecv, mca_pml_ucx_recv, + NULL, /* icrecv */ + NULL, /* crecv */ mca_pml_ucx_isend_init, mca_pml_ucx_isend, mca_pml_ucx_send, + NULL, /* icsend */ + NULL, /* csend */ mca_pml_ucx_iprobe, mca_pml_ucx_probe, mca_pml_ucx_start, diff --git a/ompi/mca/pml/yalla/pml_yalla.c b/ompi/mca/pml/yalla/pml_yalla.c index 4494ca1022d..f2f0b6eedd9 100644 --- a/ompi/mca/pml/yalla/pml_yalla.c +++ b/ompi/mca/pml/yalla/pml_yalla.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2001-2011 Mellanox Technologies Ltd. ALL RIGHTS RESERVED. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -36,9 +36,13 @@ mca_pml_yalla_module_t ompi_pml_yalla = { mca_pml_yalla_irecv_init, mca_pml_yalla_irecv, mca_pml_yalla_recv, + NULL, /* icrecv */ + NULL, /* crecv */ mca_pml_yalla_isend_init, mca_pml_yalla_isend, mca_pml_yalla_send, + NULL, /* icsend */ + NULL, /* csend */ mca_pml_yalla_iprobe, mca_pml_yalla_probe, mca_pml_yalla_start, From fecc8e70d0cacffc11871bd24cd81b1d84dbf6d3 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 17 Jun 2016 08:32:48 +0900 Subject: [PATCH 2/8] mpiext/split: add OMPI_Split_send, OMPI_Isplit_send, OMPI_Split_recv and OMPI_Isplit_recv --- ompi/mpiext/split/Makefile.am | 25 ++++++ ompi/mpiext/split/c/Makefile.am | 71 ++++++++++++++++ ompi/mpiext/split/c/OMPI_Split_send.3in | 67 +++++++++++++++ ompi/mpiext/split/c/mpiext_isplit_recv.c | 103 +++++++++++++++++++++++ ompi/mpiext/split/c/mpiext_isplit_send.c | 101 ++++++++++++++++++++++ ompi/mpiext/split/c/mpiext_split_c.h | 24 ++++++ ompi/mpiext/split/c/mpiext_split_send.c | 101 ++++++++++++++++++++++ ompi/mpiext/split/c/profile/Makefile.am | 71 ++++++++++++++++ ompi/mpiext/split/configure.m4 | 28 ++++++ 9 files changed, 591 insertions(+) create mode 100644 ompi/mpiext/split/Makefile.am create mode 100644 ompi/mpiext/split/c/Makefile.am create mode 100644 ompi/mpiext/split/c/OMPI_Split_send.3in create mode 100644 ompi/mpiext/split/c/mpiext_isplit_recv.c create mode 100644 ompi/mpiext/split/c/mpiext_isplit_send.c create mode 100644 ompi/mpiext/split/c/mpiext_split_c.h create mode 100644 ompi/mpiext/split/c/mpiext_split_send.c create mode 100644 ompi/mpiext/split/c/profile/Makefile.am create mode 100644 ompi/mpiext/split/configure.m4 diff --git a/ompi/mpiext/split/Makefile.am b/ompi/mpiext/split/Makefile.am new file mode 100644 index 00000000000..c0c7c8a5c5f --- /dev/null +++ b/ompi/mpiext/split/Makefile.am @@ -0,0 +1,25 @@ +# +# Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2016 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This Makefile is not traversed during a normal "make all" in an OMPI +# build. It *is* traversed during "make dist", however. So you can +# put EXTRA_DIST targets in here. +# +# You can also use this as a convenience for building this MPI +# extension (i.e., "make all" in this directory to invoke "make all" +# in all the subdirectories). + +SUBDIRS = c + +EXTRA_DIST = README.txt diff --git a/ompi/mpiext/split/c/Makefile.am b/ompi/mpiext/split/c/Makefile.am new file mode 100644 index 00000000000..71f7468679b --- /dev/null +++ b/ompi/mpiext/split/c/Makefile.am @@ -0,0 +1,71 @@ +# +# Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2016 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +SUBDIRS = profile + +# We must set these #defines so that the inner OMPI MPI prototype +# header files do the Right Thing. 
+AM_CPPFLAGS = -DOMPI_PROFILE_LAYER=0 -DOMPI_COMPILING_FORTRAN_WRAPPERS=1 + +# OMPI_BUILD_MPI_PROFILING is enabled when we want our generated MPI_* symbols +# to be replaced by PMPI_*. +# In this directory, we need it to be 0 + +AM_CPPFLAGS += -DOMPI_BUILD_MPI_PROFILING=0 + +# This file builds the C bindings for MPI extensions. It must be +# present in all MPI extensions. + +# Example program +example: example.c + mpicc example.c -o example -g + +CLEANFILES = example + + +include $(top_srcdir)/Makefile.ompi-rules + +# Convenience libtool library that will be slurped up into libmpi.la. +noinst_LTLIBRARIES = libmpiext_split_c.la + +# This is where the top-level header file (that is included in +# ) must be installed. +ompidir = $(ompiincludedir)/ompi/mpiext/split/c + +# This is the header file that is installed. +ompi_HEADERS = mpiext_split_c.h + +# Sources for the convenience libtool library. Other than the one +# header file, all source files in the extension have no file naming +# conventions. +libmpiext_split_c_la_SOURCES = +if BUILD_MPI_BINDINGS_LAYER +libmpiext_split_c_la_SOURCES = += \ + $(ompi_HEADERS) \ + mpiext_isplit_recv.c + mpiext_isplit_send.c + mpiext_split_send.c +endif + +libmpiext_split_c_la_LDFLAGS = -module -avoid-version +libmpiext_split_c_la_LIBADD = profile/libpmpiext_split_c.la + +# Man page installation +nodist_man_MANS = OMPI_Split_send.3 + +# Man page sources +EXTRA_DIST = $(nodist_man_MANS:.3=.3in) example.c + +distclean-local: + rm -f $(nodist_man_MANS) diff --git a/ompi/mpiext/split/c/OMPI_Split_send.3in b/ompi/mpiext/split/c/OMPI_Split_send.3in new file mode 100644 index 00000000000..79b4320c039 --- /dev/null +++ b/ompi/mpiext/split/c/OMPI_Split_send.3in @@ -0,0 +1,67 @@ +.\" -*- nroff -*- +.\" Copyright 2013 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2007-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.\" Copyright (c) 2016 Research Organization for Information Science +.\" and Technology (RIST). All rights reserved. +.\" $COPYRIGHT$ +.TH OMPI_Split_send 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBOMPI_Split_send\fP \- Performs a standard-mode blocking send. + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include +int OMPI_Split_send(const void *\fIbuf\fP, int\fI count\fP, MPI_Datatype\fI datatype\fP, int\fI dest\fP, + int\fI tag\fP, MPI_Comm\fI comm\fP) + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +buf +Initial address of send buffer (choice). +.TP 1i +count +Number of elements send (nonnegative integer). +.TP 1i +datatype +Datatype of each send buffer element (handle). +.TP 1i +dest +Rank of destination (integer). +.TP 1i +tag +Message tag (integer). +.TP 1i +comm +Communicator (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +MPI_Split_send performs a standard-mode, blocking send. + +.SH NOTE +.ft R +This routine will block until the message is sent to the destination. For an in-depth explanation of the semantics of the standard-mode send, refer to the MPI-1 Standard. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. C++ functions do not return errors. If the default error handler is set to MPI::ERRORS_THROW_EXCEPTIONS, then on error the C++ exception mechanism will be used to throw an MPI::Exception object. 
+.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler may be changed with MPI_Comm_set_errhandler; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. + +.SH SEE ALSO +.ft R +.nf +MPI_Isplit_send + diff --git a/ompi/mpiext/split/c/mpiext_isplit_recv.c b/ompi/mpiext/split/c/mpiext_isplit_recv.c new file mode 100644 index 00000000000..390a4a735a3 --- /dev/null +++ b/ompi/mpiext/split/c/mpiext_isplit_recv.c @@ -0,0 +1,103 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/memchecker.h" + +#include "ompi/mpiext/split/c/mpiext_split_c.h" + +#if OMPI_BUILD_MPI_PROFILING +#if OPAL_HAVE_WEAK_SYMBOLS +#pragma weak OMPI_Isplit_recv = POMPI_Isplit_recv +#endif +#define OMPI_Isplit_recv POMPI_Isplit_recv +#endif + +static const char FUNC_NAME[] = "OMPI_Isplit_recv"; + + +int OMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, + int tag, MPI_Comm comm, MPI_Request *request) +{ + int rc = MPI_SUCCESS; + opal_convertor_t convertor; + size_t offset; + size_t size; + + MEMCHECKER( + memchecker_datatype(type); + memchecker_comm(comm); + ); + + if ( MPI_PARAM_CHECK ) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + OMPI_CHECK_DATATYPE_FOR_RECV(rc, type, count); + OMPI_CHECK_USER_BUFFER(rc, buf, type, count); + + if (ompi_comm_invalid(comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, FUNC_NAME); + } else if (((tag < 0) && (tag != MPI_ANY_TAG)) || (tag > mca_pml.pml_max_tag)) { + rc = MPI_ERR_TAG; + } else if ((MPI_ANY_SOURCE != source) && + (MPI_PROC_NULL != source) && + ompi_comm_peer_invalid(comm, source)) { + rc = MPI_ERR_RANK; + } else if (NULL == request) { + rc = MPI_ERR_REQUEST; + } + OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME); + } + + if (source == MPI_PROC_NULL) { + *request = &ompi_request_empty; + return MPI_SUCCESS; + } + + assert(count > 0); + + if (count > 0) { + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,source); + OBJ_RETAIN(type); + OBJ_RETAIN(type); + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. 
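The message is then split in two halves: the convertor is repositioned and a convertor based receive (icrecv) is posted for each half, so the caller gets back two requests (request[0] and request[1]) to wait on.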
*/ + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(type->super), + count, + buf, + 0, + &convertor ); + opal_convertor_get_unpacked_size( &convertor, &size ); + } + size = size / 2; + offset = 0; + opal_convertor_set_position(&convertor, &offset); + OPAL_CR_ENTER_LIBRARY(); + rc = MCA_PML_CALL(icrecv(&convertor, &size, source, tag, comm, request)); + if (OMPI_SUCCESS != rc) { + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); + } + offset += size; + opal_convertor_set_position(&convertor, &offset); + rc = MCA_PML_CALL(icrecv(&convertor, &size, source, tag, comm, request+1)); + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); +} diff --git a/ompi/mpiext/split/c/mpiext_isplit_send.c b/ompi/mpiext/split/c/mpiext_isplit_send.c new file mode 100644 index 00000000000..3c5a59f13e0 --- /dev/null +++ b/ompi/mpiext/split/c/mpiext_isplit_send.c @@ -0,0 +1,101 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/memchecker.h" + +#include "ompi/mpiext/split/c/mpiext_split_c.h" + +#if OMPI_BUILD_MPI_PROFILING +#if OPAL_HAVE_WEAK_SYMBOLS +#pragma weak OMPI_Isplit_send = POMPI_Isplit_send +#endif +#define OMPI_Isplit_send POMPI_Isplit_send +#endif + +static const char FUNC_NAME[] = "OMPI_Isplit_send"; + + +int OMPI_Isplit_send(const void *buf, int count, MPI_Datatype type, int dest, + int tag, MPI_Comm comm, MPI_Request *request) +{ + int rc = MPI_SUCCESS; + opal_convertor_t convertor; + size_t offset; + size_t size; + + MEMCHECKER( + memchecker_datatype(type); + memchecker_call(&opal_memchecker_base_isdefined, buf, count, type); + memchecker_comm(comm); + ); + + if ( MPI_PARAM_CHECK ) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + if (ompi_comm_invalid(comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, FUNC_NAME); + } else if (count < 0) { + rc = MPI_ERR_COUNT; + } else if (tag < 0 || tag > mca_pml.pml_max_tag) { + rc = MPI_ERR_TAG; + } else if (ompi_comm_peer_invalid(comm, dest) && + (MPI_PROC_NULL != dest)) { + rc = MPI_ERR_RANK; + } else { + OMPI_CHECK_DATATYPE_FOR_SEND(rc, type, count); + OMPI_CHECK_USER_BUFFER(rc, buf, type, count); + } + OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == dest) { + return MPI_SUCCESS; + } + + assert(count > 0); + if (count > 0) { + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,dest); + OBJ_RETAIN(type); + OBJ_RETAIN(type); + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. 
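As on the receive side, the data is split in two halves and a convertor based send (icsend) is posted for each half, again returning two requests to the caller.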
*/ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(type->super), + count, + buf, + 0, + &convertor ); + opal_convertor_get_packed_size( &convertor, &size ); + } + size = size / 2; + offset = 0; + opal_convertor_set_position(&convertor, &offset); + OPAL_CR_ENTER_LIBRARY(); + rc = MCA_PML_CALL(icsend(&convertor, &size, dest, tag, MCA_PML_BASE_SEND_STANDARD, comm, request)); + if (OMPI_SUCCESS != rc) { + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); + } + offset += size; + opal_convertor_set_position(&convertor, &offset); + rc = MCA_PML_CALL(icsend(&convertor, &size, dest, tag, MCA_PML_BASE_SEND_STANDARD, comm, request+1)); + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); +} diff --git a/ompi/mpiext/split/c/mpiext_split_c.h b/ompi/mpiext/split/c/mpiext_split_c.h new file mode 100644 index 00000000000..923768235c9 --- /dev/null +++ b/ompi/mpiext/split/c/mpiext_split_c.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +OMPI_DECLSPEC int OMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int OMPI_Split_send(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm); +OMPI_DECLSPEC int OMPI_Isplit_send(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Request *req); + +OMPI_DECLSPEC int POMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int POMPI_Split_send(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm); +OMPI_DECLSPEC int POMPI_Isplit_send(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Request *req); diff --git a/ompi/mpiext/split/c/mpiext_split_send.c b/ompi/mpiext/split/c/mpiext_split_send.c new file mode 100644 index 00000000000..4a871868d35 --- /dev/null +++ b/ompi/mpiext/split/c/mpiext_split_send.c @@ -0,0 +1,101 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/memchecker.h" + +#include "ompi/mpiext/split/c/mpiext_split_c.h" + +#if OMPI_BUILD_MPI_PROFILING +#if OPAL_HAVE_WEAK_SYMBOLS +#pragma weak OMPI_Split_send = POMPI_Split_send +#endif +#define OMPI_Split_send POMPI_Split_send +#endif + +static const char FUNC_NAME[] = "OMPI_Split_send"; + + +int OMPI_Split_send(const void *buf, int count, MPI_Datatype type, int dest, + int tag, MPI_Comm comm) +{ + int rc = MPI_SUCCESS; + opal_convertor_t convertor; + size_t offset; + size_t size; + + MEMCHECKER( + memchecker_datatype(type); + memchecker_call(&opal_memchecker_base_isdefined, buf, count, type); + memchecker_comm(comm); + ); + + if ( MPI_PARAM_CHECK ) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + if (ompi_comm_invalid(comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, FUNC_NAME); + } else if (count < 0) { + rc = MPI_ERR_COUNT; + } else if (tag < 0 || tag > mca_pml.pml_max_tag) { + rc = MPI_ERR_TAG; + } else if (ompi_comm_peer_invalid(comm, dest) && + (MPI_PROC_NULL != dest)) { + rc = MPI_ERR_RANK; + } else { + OMPI_CHECK_DATATYPE_FOR_SEND(rc, type, count); + OMPI_CHECK_USER_BUFFER(rc, buf, type, count); + } + OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == dest) { + return MPI_SUCCESS; + } + + assert(count > 0); + if (count > 0) { + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,dest); + OBJ_RETAIN(type); + OBJ_RETAIN(type); + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(type->super), + count, + buf, + 0, + &convertor ); + opal_convertor_get_packed_size( &convertor, &size ); + } + size = size / 2; + offset = 0; + opal_convertor_set_position(&convertor, &offset); + OPAL_CR_ENTER_LIBRARY(); + rc = MCA_PML_CALL(csend(&convertor, &size, dest, tag, MCA_PML_BASE_SEND_STANDARD, comm)); + if (OMPI_SUCCESS != rc) { + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); + } + offset += size; + opal_convertor_set_position(&convertor, &offset); + rc = MCA_PML_CALL(csend(&convertor, &size, dest, tag, MCA_PML_BASE_SEND_STANDARD, comm)); + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); +} diff --git a/ompi/mpiext/split/c/profile/Makefile.am b/ompi/mpiext/split/c/profile/Makefile.am new file mode 100644 index 00000000000..99f5b174b8c --- /dev/null +++ b/ompi/mpiext/split/c/profile/Makefile.am @@ -0,0 +1,71 @@ +# -*- makefile -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2013 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011 Sandia National Laboratories. 
 All rights reserved.
+# Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved.
+# Copyright (c) 2012-2013 Inria. All rights reserved.
+# Copyright (c) 2013 Los Alamos National Security, LLC. All rights
+# reserved.
+# Copyright (c) 2015-2016 Research Organization for Information Science
+# and Technology (RIST). All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+include $(top_srcdir)/Makefile.ompi-rules
+
+# OMPI_BUILD_MPI_PROFILING is enabled when we want our generated MPI_* symbols
+# to be replaced by PMPI_*.
+# In this directory, we definitely need it to be 1.
+
+AM_CPPFLAGS = -DOMPI_BUILD_MPI_PROFILING=1
+
+#
+# This build needs to go through only if profiling is required.
+# Further, this build HAS to go through if profiling is required.
+#
+
+noinst_LTLIBRARIES = libpmpiext_split_c.la
+
+# This is where the top-level header file (that is included in
+# <mpi-ext.h>) must be installed.
+ompidir = $(ompiincludedir)/ompi/mpiext/split/c
+
+# This is the header file that is installed.
+
+nodist_libpmpiext_split_c_la_SOURCES = \
+        pmpiext_isplit_recv.c \
+        pmpiext_isplit_send.c \
+        pmpiext_split_send.c
+
+#
+# Sym link in the sources from the real MPI directory
+#
+$(nodist_libpmpiext_split_c_la_SOURCES):
+	$(OMPI_V_LN_S) if test ! -r $@ ; then \
+		pname=`echo $@ | cut -b '2-'` ; \
+		$(LN_S) $(top_srcdir)/ompi/mpiext/split/c/$$pname $@ ; \
+	fi
+
+MAINTAINERCLEANFILES = $(nodist_libpmpiext_split_c_la_SOURCES)
+
+# Don't want these targets in here
+
+tags-recursive:
+tags:
+TAGS:
+GTAGS:
+ID:
diff --git a/ompi/mpiext/split/configure.m4 b/ompi/mpiext/split/configure.m4
new file mode 100644
index 00000000000..5c8ec4c34ca
--- /dev/null
+++ b/ompi/mpiext/split/configure.m4
@@ -0,0 +1,28 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2009 The Trustees of Indiana University.
+# All rights reserved.
+# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
+# Copyright (c) 2016 Research Organization for Information Science
+# and Technology (RIST). All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# OMPI_MPIEXT_split_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+AC_DEFUN([OMPI_MPIEXT_split_CONFIG], [
+    AC_CONFIG_FILES([ompi/mpiext/split/Makefile])
+    AC_CONFIG_FILES([ompi/mpiext/split/c/Makefile])
+    AC_CONFIG_FILES([ompi/mpiext/split/c/profile/Makefile])
+
+    # This example can always build, so we just execute $1 if it was
+    # requested.
+    AS_IF([test "$ENABLE_split" = "1" || \
+           test "$ENABLE_EXT_ALL" = "1"],
+          [$1],
+          [$2])
+])

From c3d61ed168109e8330f837d881cec776c3c7e61b Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet
Date: Wed, 13 Sep 2017 10:23:21 +0900
Subject: [PATCH 3/8] coll/base/bcast

---
 ompi/mca/coll/base/coll_base_bcast.c     | 254 +++++++++++++++++++++--
 ompi/mca/coll/base/coll_base_functions.h |   3 +-
 2 files changed, 243 insertions(+), 14 deletions(-)

diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c
index 38210bab9df..942448d35be 100644
--- a/ompi/mca/coll/base/coll_base_bcast.c
+++ b/ompi/mca/coll/base/coll_base_bcast.c
@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2012      Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2016      Research Organization for Information Science
+ * Copyright (c) 2016-2017 Research Organization for Information Science
  *                         and Technology (RIST).
All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -225,6 +225,242 @@ ompi_coll_base_bcast_intra_generic( void* buffer, return err; } +int +ompi_coll_base_bcast_intra_generic2( void* buffer, + int count, + struct ompi_datatype_t* datatype, + int root, + struct ompi_communicator_t* comm, + mca_coll_base_module_t *module, + size_t segment_size, + ompi_coll_tree_t* tree ) +{ + int err = 0, line, i, rank, segindex, req_index; + opal_convertor_t send_convertors[2], recv_convertors[2]; + char *tmpbuf; + size_t offset = 0; + size_t next_offset; + size_t size; + size_t remaining; + size_t total_size; + int scindex = 0; + ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; + ompi_request_t **send_reqs = NULL; + ompi_datatype_size(datatype, &remaining); + remaining *= count; + +#if OPAL_ENABLE_DEBUG + assert( ompi_comm_size(comm) > 1 ); +#endif + rank = ompi_comm_rank(comm); + +#if 1 + /* FIXME OBJ_RETAIN(datatype) ? */ + OBJ_CONSTRUCT(&send_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&send_convertors[1], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[1], opal_convertor_t); + send_convertors[0].stack_pos = -1; + send_convertors[1].stack_pos = -1; + recv_convertors[0].stack_pos = -1; + recv_convertors[1].stack_pos = -1; + if( tree->tree_nextsize != 0 ) { + send_reqs = coll_base_comm_get_reqs(module->base_data, tree->tree_nextsize); + if( NULL == send_reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } + } + + /* Root code */ + if( rank == root ) { + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_next[i]); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(type->super), + count, + buf, + 0, + &send_convertors[0] ); + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(type->super), + count, + buf, + 0, + &send_convertors[1] ); + opal_convertor_set_position(&convertor, &offset); + while (remaining) { + next_offset = offset + (segment_sizetree_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm, + &send_reqs[i])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + offset = next_offset; + remaining -= size; + i++; sc_index ^= 1; + } + /* + For each segment: + - send segment to all children. + The last segment may have less elements than other segments. + */ + sendcount = count_by_segment; + for( segindex = 0; segindex < num_segments; segindex++ ) { + if( segindex == (num_segments - 1) ) { + sendcount = original_count - segindex * count_by_segment; + } + for( i = 0; i < tree->tree_nextsize; i++ ) { + err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, + tree->tree_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm, + &send_reqs[i])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + /* complete the sends before starting the next sends */ + err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, + MPI_STATUSES_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + /* update tmp buffer */ + tmpbuf += realsegsize; + + } + } + + /* Intermediate nodes code */ + else if( tree->tree_nextsize > 0 ) { + /* + Create the pipeline. + 1) Post the first receive + 2) For segments 1 .. 
num_segments + - post new receive + - wait on the previous receive to complete + - send this data to children + 3) Wait on the last segment + 4) Compute number of elements in last segment. + 5) Send the last segment to children + */ + req_index = 0; + err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[req_index])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + for( segindex = 1; segindex < num_segments; segindex++ ) { + + req_index = req_index ^ 0x1; + + /* post new irecv */ + err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment, + datatype, tree->tree_prev, + MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[req_index])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + /* wait for and forward the previous segment to children */ + err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], + MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + for( i = 0; i < tree->tree_nextsize; i++ ) { + err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype, + tree->tree_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm, + &send_reqs[i])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + /* complete the sends before starting the next iteration */ + err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, + MPI_STATUSES_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + /* Update the receive buffer */ + tmpbuf += realsegsize; + + } + + /* Process the last segment */ + err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment; + for( i = 0; i < tree->tree_nextsize; i++ ) { + err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, + tree->tree_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm, + &send_reqs[i])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, + MPI_STATUSES_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + /* Leaf nodes */ + else { + /* + Receive all segments from parent in a loop: + 1) post irecv for the first segment + 2) for segments 1 .. 
num_segments + - post irecv for the next segment + - wait on the previous segment to arrive + 3) wait for the last segment + */ + req_index = 0; + err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[req_index])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + for( segindex = 1; segindex < num_segments; segindex++ ) { + req_index = req_index ^ 0x1; + tmpbuf += realsegsize; + /* post receive for the next segment */ + err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[req_index])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + /* wait on the previous segment */ + err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], + MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + return (MPI_SUCCESS); + + error_hndl: + OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank) ); + (void)line; // silence compiler warnings + ompi_coll_base_free_reqs( recv_reqs, 2); + if( NULL != send_reqs ) { + ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); + } +#endif + + return err; +} + int ompi_coll_base_bcast_intra_bintree ( void* buffer, int count, @@ -262,23 +498,15 @@ ompi_coll_base_bcast_intra_pipeline( void* buffer, mca_coll_base_module_t *module, uint32_t segsize ) { - int segcount = count; - size_t typelng; mca_coll_base_comm_t *data = module->base_data; COLL_BASE_UPDATE_PIPELINE( comm, module, root ); - /** - * Determine number of elements sent per operation. - */ - ompi_datatype_type_size( datatype, &typelng ); - COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d", + ompi_comm_rank(comm), segsize)); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d", - ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); - - return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, - segcount, data->cached_pipeline ); + return ompi_coll_base_bcast_intra_generic2( buffer, count, datatype, root, comm, module, + segsize, data->cached_pipeline ); } int diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 9e81e2bd182..6e8875fcc3a 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -14,7 +14,7 @@ * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013-2016 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. 
@@ -214,6 +214,7 @@ int ompi_coll_base_barrier_intra_basic_linear(BARRIER_ARGS); /* Bcast */ int ompi_coll_base_bcast_intra_generic(BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree); +int ompi_coll_base_bcast_intra_generic2(BCAST_ARGS, size_t segment_size, ompi_coll_tree_t* tree); int ompi_coll_base_bcast_intra_basic_linear(BCAST_ARGS); int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains); int ompi_coll_base_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize); From b36dfa5c7bdef4fde35337c362892718253a3920 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 17 Jun 2016 17:29:31 +0900 Subject: [PATCH 4/8] checkpoint --- ompi/mca/coll/base/coll_base_bcast.c | 110 +++++++++++++++-------- ompi/mca/pml/ob1/pml_ob1_irecv.c | 33 ++++++- ompi/mca/pml/ob1/pml_ob1_isend.c | 2 + ompi/mca/pml/ob1/pml_ob1_recvreq.c | 6 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 3 +- ompi/mpiext/split/c/mpiext_isplit_recv.c | 2 - ompi/mpiext/split/c/mpiext_isplit_send.c | 2 - ompi/mpiext/split/c/mpiext_split_c.h | 4 + ompi/mpiext/split/c/mpiext_split_send.c | 2 - 9 files changed, 115 insertions(+), 49 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 942448d35be..0afc631609d 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -235,18 +235,16 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, size_t segment_size, ompi_coll_tree_t* tree ) { - int err = 0, line, i, rank, segindex, req_index; + int err = 0, line, i = 0, rank, req_index; opal_convertor_t send_convertors[2], recv_convertors[2]; - char *tmpbuf; size_t offset = 0; size_t next_offset; size_t size; size_t remaining; - size_t total_size; - int scindex = 0; + int sc_index = 0, rc_index = 0; ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; ompi_request_t **send_reqs = NULL; - ompi_datatype_size(datatype, &remaining); + ompi_datatype_type_size(datatype, &remaining); remaining *= count; #if OPAL_ENABLE_DEBUG @@ -276,19 +274,19 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, /* remote architecture and prepared with the type. */ opal_convertor_copy_and_prepare_for_send( proc->super.proc_convertor, - &(type->super), + &(datatype->super), count, - buf, + buffer, 0, &send_convertors[0] ); opal_convertor_copy_and_prepare_for_send( proc->super.proc_convertor, - &(type->super), + &(datatype->super), count, - buf, + buffer, 0, &send_convertors[1] ); - opal_convertor_set_position(&convertor, &offset); + opal_convertor_set_position(&send_convertors[0], &offset); while (remaining) { next_offset = offset + (segment_sizetree_next[i], - MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, - &send_reqs[i])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - offset = next_offset; - remaining -= size; - i++; sc_index ^= 1; - } - /* - For each segment: - - send segment to all children. - The last segment may have less elements than other segments. 
- */ - sendcount = count_by_segment; - for( segindex = 0; segindex < num_segments; segindex++ ) { - if( segindex == (num_segments - 1) ) { - sendcount = original_count - segindex * count_by_segment; - } + for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, - tree->tree_next[i], - MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, - &send_reqs[i])); + err = MCA_PML_CALL(icsend(&send_convertors[sc_index], + &size, + tree->tree_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm, + &send_reqs[i])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } - /* complete the sends before starting the next sends */ err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - /* update tmp buffer */ - tmpbuf += realsegsize; - + offset = next_offset; + remaining -= size; + sc_index ^= 1; } } /* Intermediate nodes code */ else if( tree->tree_nextsize > 0 ) { +#if 0 /* Create the pipeline. 1) Post the first receive @@ -410,10 +390,61 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } +#else + assert(0); +#endif } /* Leaf nodes */ else { +#if 1 + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_prev); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertors[0] ); + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertors[1] ); + opal_convertor_set_position(&recv_convertors[0], &offset); + while (remaining) { + next_offset = offset + (segment_sizetree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[rc_index])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + offset = next_offset; + remaining -= size; + rc_index ^= 1; + /* wait on the previous segment */ + err = ompi_request_wait( &recv_reqs[rc_index], + MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + + err = ompi_request_wait( &recv_reqs[rc_index^1], MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } +#else /* Receive all segments from parent in a loop: 1) post irecv for the first segment @@ -444,6 +475,7 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } +#endif } return (MPI_SUCCESS); diff --git a/ompi/mca/pml/ob1/pml_ob1_irecv.c b/ompi/mca/pml/ob1/pml_ob1_irecv.c index 36162f2005e..41413137c70 100644 --- a/ompi/mca/pml/ob1/pml_ob1_irecv.c +++ b/ompi/mca/pml/ob1/pml_ob1_irecv.c @@ -375,9 +375,10 @@ int mca_pml_ob1_icrecv(opal_convertor_t *convertor, if (NULL == recvreq) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + OBJ_RETAIN(convertor->pDesc); MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, - convertor->pBaseBuf, - convertor->count, (ompi_datatype_t *)convertor->pDesc, src, tag, comm, false); + convertor->pBaseBuf, + convertor->count, (ompi_datatype_t *)convertor->pDesc, src, tag, comm, false); recvreq->req_recv.req_base.req_offset = convertor->bConverted; // recvreq->req_recv.req_bytes_expected = *size; @@ -430,6 +431,34 @@ int 
mca_pml_ob1_crecv(opal_convertor_t *convertor, rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + if (OPAL_UNLIKELY(ompi_mpi_thread_multiple || NULL != mca_pml_ob1_recvreq)) { + MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq); + } else { + mca_pml_ob1_recv_request_fini (recvreq); + mca_pml_ob1_recvreq = recvreq; + } +#else + OBJ_RETAIN(convertor->pDesc); + MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, + convertor->pBaseBuf, + convertor->count, (ompi_datatype_t *)convertor->pDesc, src, tag, comm, false); + recvreq->req_recv.req_base.req_offset = convertor->bConverted; + // recvreq->req_recv.req_bytes_expected = *size; + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + // MCA_PML_OB1_RECV_REQUEST_START(recvreq); + mca_pml_ob1_recv_req_start_with_convertor(recvreq, convertor, *size); + ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi); + + if (NULL != status) { /* return status */ + *status = recvreq->req_recv.req_base.req_ompi.req_status; + } + + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + if (OPAL_UNLIKELY(ompi_mpi_thread_multiple || NULL != mca_pml_ob1_recvreq)) { MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq); } else { diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 97687e1bb7f..df2a02d57bf 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -317,6 +317,7 @@ int mca_pml_ob1_icsend(opal_convertor_t* convertor, if (NULL == sendreq) return OMPI_ERR_OUT_OF_RESOURCE; + OBJ_RETAIN(convertor->pDesc); MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, convertor->pBaseBuf, 0, @@ -380,6 +381,7 @@ int mca_pml_ob1_csend(struct opal_convertor_t* convertor, sendreq->req_send.req_base.req_proc = dst_proc; sendreq->rdma_frag = NULL; + OBJ_RETAIN(convertor->pDesc); MCA_PML_OB1_SEND_REQUEST_INIT(sendreq, convertor->pBaseBuf, 0, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 32b3ac97219..071e3b66854 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -735,8 +735,10 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq /* updating the write location */ OPAL_THREAD_LOCK(&recvreq->lock); + offset += recvreq->req_recv.req_base.req_offset; opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address); + offset -= recvreq->req_recv.req_base.req_offset; OPAL_THREAD_UNLOCK(&recvreq->lock); frag->rdma_bml = rdma_bml; @@ -779,7 +781,7 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq { size_t bytes_received = 0; size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */ - size_t data_offset = 0; + size_t data_offset = recvreq->req_recv.req_base.req_offset; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, @@ -998,9 +1000,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq, /* take lock to protect convertor against concurrent access * from unpack */ OPAL_THREAD_LOCK(&recvreq->lock); + recvreq->req_rdma_offset += recvreq->req_recv.req_base.req_offset; opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor, &recvreq->req_rdma_offset); opal_convertor_get_current_pointer 
(&recvreq->req_recv.req_base.req_convertor, &data_ptr); + recvreq->req_rdma_offset -= recvreq->req_recv.req_base.req_offset; OPAL_THREAD_UNLOCK(&recvreq->lock); if (btl->btl_register_mem) { diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index eac167c6901..1870615b425 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -1235,7 +1235,8 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle * non-contiguous RDMA. If that changes this code will be wrong. */ opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor, - hdr->hdr_rdma_offset, &frag->local_address); + hdr->hdr_rdma_offset+sendreq->req_send.req_base.req_offset, + &frag->local_address); mca_pml_ob1_send_request_put_frag(frag); } diff --git a/ompi/mpiext/split/c/mpiext_isplit_recv.c b/ompi/mpiext/split/c/mpiext_isplit_recv.c index 390a4a735a3..dd4ae4a0bff 100644 --- a/ompi/mpiext/split/c/mpiext_isplit_recv.c +++ b/ompi/mpiext/split/c/mpiext_isplit_recv.c @@ -73,8 +73,6 @@ int OMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, if (count > 0) { ompi_proc_t* proc = ompi_comm_peer_lookup(comm,source); - OBJ_RETAIN(type); - OBJ_RETAIN(type); OBJ_CONSTRUCT(&convertor, opal_convertor_t); convertor.stack_pos = -1; /* We will create a convertor specialized for the */ diff --git a/ompi/mpiext/split/c/mpiext_isplit_send.c b/ompi/mpiext/split/c/mpiext_isplit_send.c index 3c5a59f13e0..63cd9c05bcf 100644 --- a/ompi/mpiext/split/c/mpiext_isplit_send.c +++ b/ompi/mpiext/split/c/mpiext_isplit_send.c @@ -71,8 +71,6 @@ int OMPI_Isplit_send(const void *buf, int count, MPI_Datatype type, int dest, assert(count > 0); if (count > 0) { ompi_proc_t* proc = ompi_comm_peer_lookup(comm,dest); - OBJ_RETAIN(type); - OBJ_RETAIN(type); OBJ_CONSTRUCT(&convertor, opal_convertor_t); convertor.stack_pos = -1; /* We will create a convertor specialized for the */ diff --git a/ompi/mpiext/split/c/mpiext_split_c.h b/ompi/mpiext/split/c/mpiext_split_c.h index 923768235c9..1d7acb57b7e 100644 --- a/ompi/mpiext/split/c/mpiext_split_c.h +++ b/ompi/mpiext/split/c/mpiext_split_c.h @@ -15,6 +15,8 @@ OMPI_DECLSPEC int OMPI_Split_send(const void *buf, int count, MPI_Datatype data int tag, MPI_Comm comm); OMPI_DECLSPEC int OMPI_Isplit_send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *req); +OMPI_DECLSPEC int OMPI_Split_recv(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int POMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, int tag, MPI_Comm comm, MPI_Request *request); @@ -22,3 +24,5 @@ OMPI_DECLSPEC int POMPI_Split_send(const void *buf, int count, MPI_Datatype dat int tag, MPI_Comm comm); OMPI_DECLSPEC int POMPI_Isplit_send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *req); +OMPI_DECLSPEC int POMPI_Split_recv(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Status *status); diff --git a/ompi/mpiext/split/c/mpiext_split_send.c b/ompi/mpiext/split/c/mpiext_split_send.c index 4a871868d35..8750f5d1122 100644 --- a/ompi/mpiext/split/c/mpiext_split_send.c +++ b/ompi/mpiext/split/c/mpiext_split_send.c @@ -71,8 +71,6 @@ int OMPI_Split_send(const void *buf, int count, MPI_Datatype type, int dest, assert(count > 0); if (count > 0) { 
ompi_proc_t* proc = ompi_comm_peer_lookup(comm,dest); - OBJ_RETAIN(type); - OBJ_RETAIN(type); OBJ_CONSTRUCT(&convertor, opal_convertor_t); convertor.stack_pos = -1; /* We will create a convertor specialized for the */ From 948a329d4b8f2162b217ff1f0923cd9c8cec8922 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Sat, 18 Jun 2016 11:12:17 +0900 Subject: [PATCH 5/8] ompiext/split: fix misc typos --- ompi/mpiext/split/README.txt | 0 ompi/mpiext/split/c/Makefile.am | 15 ++++----------- 2 files changed, 4 insertions(+), 11 deletions(-) create mode 100644 ompi/mpiext/split/README.txt diff --git a/ompi/mpiext/split/README.txt b/ompi/mpiext/split/README.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ompi/mpiext/split/c/Makefile.am b/ompi/mpiext/split/c/Makefile.am index 71f7468679b..e1f2d75bf51 100644 --- a/ompi/mpiext/split/c/Makefile.am +++ b/ompi/mpiext/split/c/Makefile.am @@ -27,13 +27,6 @@ AM_CPPFLAGS += -DOMPI_BUILD_MPI_PROFILING=0 # This file builds the C bindings for MPI extensions. It must be # present in all MPI extensions. -# Example program -example: example.c - mpicc example.c -o example -g - -CLEANFILES = example - - include $(top_srcdir)/Makefile.ompi-rules # Convenience libtool library that will be slurped up into libmpi.la. @@ -51,10 +44,10 @@ ompi_HEADERS = mpiext_split_c.h # conventions. libmpiext_split_c_la_SOURCES = if BUILD_MPI_BINDINGS_LAYER -libmpiext_split_c_la_SOURCES = += \ +libmpiext_split_c_la_SOURCES += \ $(ompi_HEADERS) \ - mpiext_isplit_recv.c - mpiext_isplit_send.c + mpiext_isplit_recv.c \ + mpiext_isplit_send.c \ mpiext_split_send.c endif @@ -65,7 +58,7 @@ libmpiext_split_c_la_LIBADD = profile/libpmpiext_split_c.la nodist_man_MANS = OMPI_Split_send.3 # Man page sources -EXTRA_DIST = $(nodist_man_MANS:.3=.3in) example.c +EXTRA_DIST = $(nodist_man_MANS:.3=.3in) distclean-local: rm -f $(nodist_man_MANS) From b908405f2ce21d6b50839cf298d94ec7716b28c9 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 13 Sep 2017 10:27:41 +0900 Subject: [PATCH 6/8] checkpoint --- ompi/mca/coll/base/coll_base_bcast.c | 62 +++++---------- ompi/mca/coll/base/coll_base_functions.h | 3 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 11 ++- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 9 ++- ompi/mpiext/split/c/Makefile.am | 1 + ompi/mpiext/split/c/mpiext_isplit_recv.c | 2 + ompi/mpiext/split/c/mpiext_split_c.h | 10 +-- ompi/mpiext/split/c/mpiext_split_recv.c | 99 ++++++++++++++++++++++++ ompi/mpiext/split/c/profile/Makefile.am | 1 + 9 files changed, 140 insertions(+), 58 deletions(-) create mode 100644 ompi/mpiext/split/c/mpiext_split_recv.c diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 0afc631609d..3b99573d7ce 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -36,13 +36,13 @@ int ompi_coll_base_bcast_intra_generic( void* buffer, - int original_count, - struct ompi_datatype_t* datatype, - int root, - struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, - uint32_t count_by_segment, - ompi_coll_tree_t* tree ) + int count, + struct ompi_datatype_t* datatype, + int root, + struct ompi_communicator_t* comm, + mca_coll_base_module_t *module, + size_t segment_size, + ompi_coll_tree_t* tree ) { int err = 0, line, i, rank, segindex, req_index; int num_segments; /* Number of segments */ @@ -235,7 +235,7 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, size_t segment_size, ompi_coll_tree_t* tree ) { - int err = 0, line, i = 0, rank, req_index; + int err = 0, line, i 
= 0, rank; opal_convertor_t send_convertors[2], recv_convertors[2]; size_t offset = 0; size_t next_offset; @@ -502,23 +502,15 @@ ompi_coll_base_bcast_intra_bintree ( void* buffer, mca_coll_base_module_t *module, uint32_t segsize ) { - int segcount = count; - size_t typelng; mca_coll_base_comm_t *data = module->base_data; COLL_BASE_UPDATE_BINTREE( comm, module, root ); - /** - * Determine number of elements sent per operation. - */ - ompi_datatype_type_size( datatype, &typelng ); - COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - - OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d", - ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d", + ompi_comm_rank(comm), segsize)); return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, - segcount, data->cached_bintree ); + segsize, data->cached_bintree ); } int @@ -537,8 +529,8 @@ ompi_coll_base_bcast_intra_pipeline( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d", ompi_comm_rank(comm), segsize)); - return ompi_coll_base_bcast_intra_generic2( buffer, count, datatype, root, comm, module, - segsize, data->cached_pipeline ); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, + segsize, data->cached_pipeline ); } int @@ -550,23 +542,15 @@ ompi_coll_base_bcast_intra_chain( void* buffer, mca_coll_base_module_t *module, uint32_t segsize, int32_t chains ) { - int segcount = count; - size_t typelng; mca_coll_base_comm_t *data = module->base_data; COLL_BASE_UPDATE_CHAIN( comm, module, root, chains ); - /** - * Determine number of elements sent per operation. - */ - ompi_datatype_type_size( datatype, &typelng ); - COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - - OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d", - ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d", + ompi_comm_rank(comm), chains, segsize)); return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, - segcount, data->cached_chain ); + segsize, data->cached_chain ); } int @@ -578,23 +562,15 @@ ompi_coll_base_bcast_intra_binomial( void* buffer, mca_coll_base_module_t *module, uint32_t segsize ) { - int segcount = count; - size_t typelng; mca_coll_base_comm_t *data = module->base_data; COLL_BASE_UPDATE_BMTREE( comm, module, root ); - /** - * Determine number of elements sent per operation. 
- */ - ompi_datatype_type_size( datatype, &typelng ); - COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); - - OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d", - ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d", + ompi_comm_rank(comm), segsize)); return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, - segcount, data->cached_bmtree ); + segsize, data->cached_bmtree ); } int diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 6e8875fcc3a..cd1cd30901a 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -213,8 +213,7 @@ int ompi_coll_base_barrier_intra_tree(BARRIER_ARGS); int ompi_coll_base_barrier_intra_basic_linear(BARRIER_ARGS); /* Bcast */ -int ompi_coll_base_bcast_intra_generic(BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree); -int ompi_coll_base_bcast_intra_generic2(BCAST_ARGS, size_t segment_size, ompi_coll_tree_t* tree); +int ompi_coll_base_bcast_intra_generic(BCAST_ARGS, size_t segment_size, ompi_coll_tree_t* tree); int ompi_coll_base_bcast_intra_basic_linear(BCAST_ARGS); int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains); int ompi_coll_base_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize); diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 071e3b66854..34c9ecaca11 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -694,7 +694,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq #endif /* OPAL_CUDA_GDR_SUPPORT */ - offset = 0; + offset = recvreq->req_recv.req_base.req_offset; OPAL_THREAD_LOCK(&recvreq->lock); opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); @@ -731,14 +731,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size); /* update the read location */ - frag->remote_address = hdr->hdr_src_ptr + offset; + frag->remote_address = hdr->hdr_src_ptr + offset - recvreq->req_recv.req_base.req_offset; /* updating the write location */ OPAL_THREAD_LOCK(&recvreq->lock); - offset += recvreq->req_recv.req_base.req_offset; opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address); - offset -= recvreq->req_recv.req_base.req_offset; OPAL_THREAD_UNLOCK(&recvreq->lock); frag->rdma_bml = rdma_bml; @@ -847,7 +845,8 @@ void mca_pml_ob1_recv_request_progress_match( mca_pml_ob1_recv_request_t* recvre mca_btl_base_segment_t* segments, size_t num_segments ) { - size_t bytes_received, data_offset = 0; + // size_t bytes_received, data_offset = 0; + size_t bytes_received, data_offset = recvreq->req_recv.req_base.req_offset; size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */ mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; @@ -1295,7 +1294,7 @@ void mca_pml_ob1_recv_req_start_with_convertor(mca_pml_ob1_recv_request_t *req, req->req_pending = false; req->req_ack_sent = false; - MCA_PML_BASE_RECV_START(&req->req_recv.req_base); + MCA_PML_BASE_RECV_START(&req->req_recv); 
OB1_MATCHING_LOCK(&ob1_comm->matching_lock); /** diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index b5b905e94f6..fc3401e60a9 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -159,7 +159,12 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type) } #define MCA_PML_OB1_SEND_REQUEST_RESET(sendreq) \ - MCA_PML_BASE_SEND_REQUEST_RESET(&(sendreq)->req_send) + if ((sendreq)->req_send.req_bytes_packed > 0) { \ + size_t _position = sendreq->req_send.req_base.req_offset; \ + opal_convertor_set_position(&(sendreq)->req_send.req_base.req_convertor, \ + &_position); \ + assert( sendreq->req_send.req_base.req_offset == _position ); \ + } static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq) { @@ -486,7 +491,7 @@ mca_pml_ob1_send_request_start_seq_size (mca_pml_ob1_send_request_t* sendreq, mc sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE; sendreq->req_send.req_base.req_sequence = seqn; - MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base ); + MCA_PML_BASE_SEND_START( &sendreq->req_send ); for(size_t i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { mca_bml_base_btl_t* bml_btl; diff --git a/ompi/mpiext/split/c/Makefile.am b/ompi/mpiext/split/c/Makefile.am index e1f2d75bf51..0bb06319dd7 100644 --- a/ompi/mpiext/split/c/Makefile.am +++ b/ompi/mpiext/split/c/Makefile.am @@ -48,6 +48,7 @@ libmpiext_split_c_la_SOURCES += \ $(ompi_HEADERS) \ mpiext_isplit_recv.c \ mpiext_isplit_send.c \ + mpiext_split_recv.c \ mpiext_split_send.c endif diff --git a/ompi/mpiext/split/c/mpiext_isplit_recv.c b/ompi/mpiext/split/c/mpiext_isplit_recv.c index dd4ae4a0bff..9182e5452b5 100644 --- a/ompi/mpiext/split/c/mpiext_isplit_recv.c +++ b/ompi/mpiext/split/c/mpiext_isplit_recv.c @@ -88,12 +88,14 @@ int OMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, } size = size / 2; offset = 0; +#if 0 opal_convertor_set_position(&convertor, &offset); OPAL_CR_ENTER_LIBRARY(); rc = MCA_PML_CALL(icrecv(&convertor, &size, source, tag, comm, request)); if (OMPI_SUCCESS != rc) { OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); } +#endif offset += size; opal_convertor_set_position(&convertor, &offset); rc = MCA_PML_CALL(icrecv(&convertor, &size, source, tag, comm, request+1)); diff --git a/ompi/mpiext/split/c/mpiext_split_c.h b/ompi/mpiext/split/c/mpiext_split_c.h index 1d7acb57b7e..cb78966b184 100644 --- a/ompi/mpiext/split/c/mpiext_split_c.h +++ b/ompi/mpiext/split/c/mpiext_split_c.h @@ -13,16 +13,16 @@ OMPI_DECLSPEC int OMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int int tag, MPI_Comm comm, MPI_Request *request); OMPI_DECLSPEC int OMPI_Split_send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); +OMPI_DECLSPEC int OMPI_Split_recv(void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int OMPI_Isplit_send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *req); -OMPI_DECLSPEC int OMPI_Split_recv(const void *buf, int count, MPI_Datatype datatype, int dest, - int tag, MPI_Comm comm, MPI_Status *status); OMPI_DECLSPEC int POMPI_Isplit_recv(void *buf, int count, MPI_Datatype type, int source, int tag, MPI_Comm comm, MPI_Request *request); -OMPI_DECLSPEC int POMPI_Split_send(const void *buf, int count, MPI_Datatype datatype, int dest, - int tag, MPI_Comm comm); OMPI_DECLSPEC int POMPI_Isplit_send(const void *buf, int 
count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *req); -OMPI_DECLSPEC int POMPI_Split_recv(const void *buf, int count, MPI_Datatype datatype, int dest, +OMPI_DECLSPEC int POMPI_Split_recv(void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Status *status); +OMPI_DECLSPEC int POMPI_Split_send(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm); diff --git a/ompi/mpiext/split/c/mpiext_split_recv.c b/ompi/mpiext/split/c/mpiext_split_recv.c new file mode 100644 index 00000000000..c79040b7798 --- /dev/null +++ b/ompi/mpiext/split/c/mpiext_split_recv.c @@ -0,0 +1,99 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/memchecker.h" + +#include "ompi/mpiext/split/c/mpiext_split_c.h" + +#if OMPI_BUILD_MPI_PROFILING +#if OPAL_HAVE_WEAK_SYMBOLS +#pragma weak OMPI_Split_recv = POMPI_Split_recv +#endif +#define OMPI_Split_recv POMPI_Split_recv +#endif + +static const char FUNC_NAME[] = "OMPI_Split_recv"; + + +int OMPI_Split_recv(void *buf, int count, MPI_Datatype type, int source, + int tag, MPI_Comm comm, MPI_Status *statuses) +{ + int rc = MPI_SUCCESS; + opal_convertor_t convertor; + size_t offset; + size_t size; + + MEMCHECKER( + memchecker_datatype(type); + memchecker_call(&opal_memchecker_base_isaddressable, buf, count, type); + memchecker_comm(comm); + ); + + if ( MPI_PARAM_CHECK ) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + OMPI_CHECK_DATATYPE_FOR_RECV(rc, type, count); + OMPI_CHECK_USER_BUFFER(rc, buf, type, count); + + if (ompi_comm_invalid(comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, FUNC_NAME); + } else if (((tag < 0) && (tag != MPI_ANY_TAG)) || (tag > mca_pml.pml_max_tag)) { + rc = MPI_ERR_TAG; + } else if ((source != MPI_ANY_SOURCE) && + (MPI_PROC_NULL != source) && + ompi_comm_peer_invalid(comm, source)) { + rc = MPI_ERR_RANK; + } + + OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == source) { + return MPI_SUCCESS; + } + assert(count > 0); + + if (count > 0) { + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,source); + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. 
*/ + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(type->super), + count, + buf, + 0, + &convertor ); + opal_convertor_get_unpacked_size( &convertor, &size ); + } + size = size / 2; + offset = 0; + opal_convertor_set_position(&convertor, &offset); + OPAL_CR_ENTER_LIBRARY(); + rc = MCA_PML_CALL(crecv(&convertor, &size, source, tag, comm, (MPI_STATUSES_IGNORE==statuses)?MPI_STATUS_IGNORE:statuses)); + if (OMPI_SUCCESS != rc) { + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); + } + offset += size; + opal_convertor_set_position(&convertor, &offset); + rc = MCA_PML_CALL(crecv(&convertor, &size, source, tag, comm, (MPI_STATUSES_IGNORE==statuses)?MPI_STATUS_IGNORE:statuses+1)); + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); +} diff --git a/ompi/mpiext/split/c/profile/Makefile.am b/ompi/mpiext/split/c/profile/Makefile.am index 99f5b174b8c..9eb8224f2f6 100644 --- a/ompi/mpiext/split/c/profile/Makefile.am +++ b/ompi/mpiext/split/c/profile/Makefile.am @@ -49,6 +49,7 @@ ompidir = $(ompiincludedir)/ompi/mpiext/split/c nodist_libpmpiext_split_c_la_SOURCES = \ pmpiext_isplit_recv.c \ pmpiext_isplit_send.c \ + pmpiext_split_recv.c \ pmpiext_split_send.c # From f4628a5caca688b7eaab4450e09acf395baad846 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 21 Jun 2016 17:24:56 +0900 Subject: [PATCH 7/8] checkpoint --- ompi/mca/coll/base/coll_base_bcast.c | 663 +++++++++++++++++++-------- 1 file changed, 468 insertions(+), 195 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 3b99573d7ce..a95af671d52 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -236,7 +236,6 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, ompi_coll_tree_t* tree ) { int err = 0, line, i = 0, rank; - opal_convertor_t send_convertors[2], recv_convertors[2]; size_t offset = 0; size_t next_offset; size_t size; @@ -252,16 +251,6 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, #endif rank = ompi_comm_rank(comm); -#if 1 - /* FIXME OBJ_RETAIN(datatype) ? */ - OBJ_CONSTRUCT(&send_convertors[0], opal_convertor_t); - OBJ_CONSTRUCT(&send_convertors[1], opal_convertor_t); - OBJ_CONSTRUCT(&recv_convertors[0], opal_convertor_t); - OBJ_CONSTRUCT(&recv_convertors[1], opal_convertor_t); - send_convertors[0].stack_pos = -1; - send_convertors[1].stack_pos = -1; - recv_convertors[0].stack_pos = -1; - recv_convertors[1].stack_pos = -1; if( tree->tree_nextsize != 0 ) { send_reqs = coll_base_comm_get_reqs(module->base_data, tree->tree_nextsize); if( NULL == send_reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } @@ -269,7 +258,12 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, /* Root code */ if( rank == root ) { - ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_next[i]); + opal_convertor_t send_convertors[2]; + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_next[0]); + OBJ_CONSTRUCT(&send_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&send_convertors[1], opal_convertor_t); + send_convertors[0].stack_pos = -1; + send_convertors[1].stack_pos = -1; /* We will create a convertor specialized for the */ /* remote architecture and prepared with the type. 
*/ opal_convertor_copy_and_prepare_for_send( @@ -299,8 +293,29 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, } size = next_offset - offset; - for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(icsend(&send_convertors[sc_index], + err = MCA_PML_CALL(icsend(&send_convertors[sc_index], + &size, + tree->tree_next[0], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm, + &send_reqs[0])); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + for( i = 1; i < tree->tree_nextsize; i++ ) { + opal_convertor_t send_convertor; + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_next[0]); + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + opal_convertor_set_position(&send_convertor, &offset); + err = MCA_PML_CALL(icsend(&send_convertor, &size, tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, @@ -321,7 +336,15 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, /* Intermediate nodes code */ else if( tree->tree_nextsize > 0 ) { -#if 0 + opal_convertor_t send_convertors[2], recv_convertors[2]; + OBJ_CONSTRUCT(&send_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&send_convertors[1], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[1], opal_convertor_t); + send_convertors[0].stack_pos = -1; + send_convertors[1].stack_pos = -1; + recv_convertors[0].stack_pos = -1; + recv_convertors[1].stack_pos = -1; /* Create the pipeline. 1) Post the first receive @@ -333,30 +356,75 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, 4) Compute number of elements in last segment. 5) Send the last segment to children */ - req_index = 0; - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_prev); + size_t sizes[2], offsets[2]; + offsets[0] = 0; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. 
*/ + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertors[0] ); + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertors[1] ); + opal_convertor_set_position(&recv_convertors[0], &offsets[0]); + next_offset = offsets[0] + (segment_sizetree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[rc_index])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( segindex = 1; segindex < num_segments; segindex++ ) { - - req_index = req_index ^ 0x1; - - /* post new irecv */ - err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment, - datatype, tree->tree_prev, - MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); + offsets[1] = sizes[0]; + remaining -= sizes[0]; + while (remaining) { + rc_index ^= 1; + next_offset = offsets[rc_index] + (segment_sizetree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &recv_reqs[rc_index])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - /* wait for and forward the previous segment to children */ - err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], - MPI_STATUS_IGNORE ); + /* wait on the previous segment */ + err = ompi_request_wait( &recv_reqs[rc_index^1], MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype, + opal_convertor_t send_convertor; + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + ompi_proc_t* proc = ompi_comm_peer_lookup(comm, tree->tree_next[i]); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + opal_convertor_set_position(&send_convertor, &offsets[rc_index^1]); + err = MCA_PML_CALL(icsend(&send_convertor, &sizes[rc_index^1], tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm, @@ -368,18 +436,30 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - /* Update the receive buffer */ - tmpbuf += realsegsize; - + offsets[rc_index^1] = next_offset; + remaining -= sizes[rc_index]; } /* Process the last segment */ - err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); + err = ompi_request_wait( &recv_reqs[rc_index], MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment; + for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, + opal_convertor_t send_convertor; + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + ompi_proc_t* proc = ompi_comm_peer_lookup(comm, tree->tree_next[i]); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. 
*/ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + opal_convertor_set_position(&send_convertor, &offsets[rc_index]); + err = MCA_PML_CALL(icsend(&send_convertor, &sizes[rc_index], tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm, @@ -387,18 +467,20 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } + /* complete the sends before starting the next iteration */ err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } -#else - assert(0); -#endif } /* Leaf nodes */ else { -#if 1 + opal_convertor_t recv_convertors[2]; ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_prev); + OBJ_CONSTRUCT(&recv_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[1], opal_convertor_t); + recv_convertors[0].stack_pos = -1; + recv_convertors[1].stack_pos = -1; /* We will create a convertor specialized for the */ /* remote architecture and prepared with the type. */ opal_convertor_copy_and_prepare_for_recv( @@ -444,38 +526,6 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, err = ompi_request_wait( &recv_reqs[rc_index^1], MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } -#else - /* - Receive all segments from parent in a loop: - 1) post irecv for the first segment - 2) for segments 1 .. num_segments - - post irecv for the next segment - - wait on the previous segment to arrive - 3) wait for the last segment - */ - req_index = 0; - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( segindex = 1; segindex < num_segments; segindex++ ) { - req_index = req_index ^ 0x1; - tmpbuf += realsegsize; - /* post receive for the next segment */ - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - /* wait on the previous segment */ - err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], - MPI_STATUS_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } -#endif } return (MPI_SUCCESS); @@ -488,7 +538,6 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, if( NULL != send_reqs ) { ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); } -#endif return err; } @@ -582,16 +631,13 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, mca_coll_base_module_t *module, uint32_t segsize ) { - int err=0, line, rank, size, segindex, i, lr, pair; - uint32_t counts[2]; - int segcount[2]; /* Number of elements sent with each segment */ - int num_segments[2]; /* Number of segmenets */ - int sendcount[2]; /* the same like segcount, except for the last segment */ - size_t realsegsize[2], type_size; - char *tmpbuf[2]; - ptrdiff_t type_extent, lb; - ompi_request_t *base_req, *new_req; + int err=0, line, rank, size, i, lr, pair; + size_t type_size; ompi_coll_tree_t *tree; + opal_convertor_t send_convertors[2], recv_convertors[2]; + size_t remainings[2], sizes[2]; + ompi_proc_t *proc; + int rc_index = 0; size = ompi_comm_size(comm); rank = 
ompi_comm_rank(comm); @@ -608,47 +654,79 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, err = ompi_datatype_type_size( datatype, &type_size ); - /* Determine number of segments and number of elements per segment */ - counts[0] = count/2; - if (count % 2 != 0) counts[0]++; - counts[1] = count - counts[0]; - if ( segsize > 0 ) { - /* Note that ompi_datatype_type_size() will never return a negative - value in typelng; it returns an int [vs. an unsigned type] - because of the MPI spec. */ - if (segsize < ((uint32_t) type_size)) { - segsize = type_size; /* push segsize up to hold one type */ + sizes[1] = type_size * count ; + sizes[0] = sizes[1] / 2; + + OBJ_CONSTRUCT(&send_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&send_convertors[1], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[0], opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertors[1], opal_convertor_t); + send_convertors[0].stack_pos = -1; + send_convertors[1].stack_pos = -1; + recv_convertors[0].stack_pos = -1; + recv_convertors[1].stack_pos = -1; + + if (rank == root || tree->tree_nextsize > 0) { + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + proc = ompi_comm_peer_lookup(comm,tree->tree_next[0]); + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertors[0] ); + if (tree->tree_nextsize > 1) { + proc = ompi_comm_peer_lookup(comm, tree->tree_next[1]); + } else { + proc = ompi_comm_peer_lookup(comm, (root+size-1)%size); } - segcount[0] = segcount[1] = segsize / type_size; - num_segments[0] = counts[0]/segcount[0]; - if ((counts[0] % segcount[0]) != 0) num_segments[0]++; - num_segments[1] = counts[1]/segcount[1]; - if ((counts[1] % segcount[1]) != 0) num_segments[1]++; - } else { - segcount[0] = counts[0]; - segcount[1] = counts[1]; - num_segments[0] = num_segments[1] = 1; + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertors[1] ); + opal_convertor_set_position(&send_convertors[1], &sizes[0]); + } + if (rank != root) { + /* Just consume segments as fast as possible */ + proc = ompi_comm_peer_lookup(comm, tree->tree_prev); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertors[0]); + opal_convertor_set_position(&recv_convertors[0], &sizes[0]); + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertors[1]); } - + sizes[1] -= sizes[0]; + remainings[0] = sizes[0]; + remainings[1] = sizes[1]; + /* if the message is too small to be split into segments */ - if( (counts[0] == 0 || counts[1] == 0) || - (segsize > ((ptrdiff_t)counts[0] * type_size)) || - (segsize > ((ptrdiff_t)counts[1] * type_size)) ) { + if(0 == sizes[0] || 0 == sizes[1]) { /* call linear version here ! 
*/ return (ompi_coll_base_bcast_intra_chain ( buffer, count, datatype, root, comm, module, segsize, 1 )); } - - err = ompi_datatype_get_extent (datatype, &lb, &type_extent); - - /* Determine real segment size */ - realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent; - realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent; - - /* set the buffer pointers */ - tmpbuf[0] = (char *) buffer; - tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent; + if (0 == segsize) { + // segsize = max(sizes); + segsize = sizes[1]; + } /* Step 1: Root splits the buffer in 2 and sends segmented message down the branches. @@ -661,26 +739,34 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, /* root code */ if( rank == root ) { - /* determine segment count */ - sendcount[0] = segcount[0]; - sendcount[1] = segcount[1]; /* for each segment */ - for (segindex = 0; segindex < num_segments[0]; segindex++) { + while(0 != remainings[0] || (tree->tree_nextsize > 1 && 0 != remainings[1])) { /* for each child */ for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) { - if (segindex >= num_segments[i]) { /* no more segments */ + size_t segment_size; + size_t offset, next_offset; + if (0 == remainings[i]) { /* no more data to send */ continue; } /* determine how many elements are being sent in this round */ - if(segindex == (num_segments[i] - 1)) - sendcount[i] = counts[i] - segindex*segcount[i]; + offset = sizes[i] - remainings[i]; + next_offset = offset + segsize; + if (next_offset > sizes[i]) { + next_offset = sizes[i]; + } + if (0 != i) { + offset += sizes[0]; + next_offset += sizes[0]; + } + opal_convertor_set_position(&send_convertors[i], &next_offset); + segment_size = next_offset - offset; + opal_convertor_set_position(&send_convertors[i], &offset); /* send data */ - MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype, + MCA_PML_CALL(csend(&send_convertors[i], &segment_size, tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - /* update tmp buffer */ - tmpbuf[i] += realsegsize[i]; + remainings[i] -= segment_size; } } } @@ -698,68 +784,168 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, * post the next receive and after that wait for the previous receive to complete * and we disseminating the data to all children. 
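The root loop above works in bytes of the packed stream rather than datatype elements: each round carves at most segsize bytes off whichever half still has data, clamps the last segment at the end of that half, and shifts the right half by sizes[0] so offsets are absolute. Below is a standalone sketch of just that arithmetic, with arbitrary example values and both subtrees assumed present; no MPI calls are involved.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t type_size = 8, count = 131;        /* arbitrary example values */
    size_t sizes[2], remainings[2], segsize = 256;

    sizes[1] = type_size * count;             /* total bytes, as in the hunk */
    sizes[0] = sizes[1] / 2;                  /* left half */
    sizes[1] -= sizes[0];                     /* right half */
    remainings[0] = sizes[0];
    remainings[1] = sizes[1];

    while (remainings[0] != 0 || remainings[1] != 0) {
        for (int i = 0; i < 2; i++) {
            if (0 == remainings[i]) continue; /* this half is done */
            size_t offset = sizes[i] - remainings[i];
            size_t next_offset = offset + segsize;
            if (next_offset > sizes[i])       /* clamp the last segment */
                next_offset = sizes[i];
            if (0 != i) {                     /* right half: absolute positions */
                offset += sizes[0];
                next_offset += sizes[0];
            }
            size_t segment_size = next_offset - offset;
            printf("half %d: send %zu bytes at offset %zu\n",
                   i, segment_size, offset);
            remainings[i] -= segment_size;
        }
    }
    return 0;
}

Because everything is expressed in bytes, the last segment of each half simply shrinks to whatever remains, which is exactly what the next_offset clamp does in the hunk.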
*/ - sendcount[lr] = segcount[lr]; - err = MCA_PML_CALL(irecv(tmpbuf[lr], sendcount[lr], datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &base_req)); + size_t offsets[2]; + size_t segment_sizes[2]; + ompi_request_t *reqs[2]; + /* determine how many elements are being sent in this round */ + offsets[0] = 0; + if (0 != lr) { + offsets[0] += sizes[0]; + } + opal_convertor_set_position(&recv_convertors[0], &offsets[0]); + offsets[0] += sizes[lr] - remainings[lr]; + offsets[1] = offsets[0] + segsize; + if (offsets[1] > sizes[lr]) { + offsets[1] = sizes[lr]; + } + opal_convertor_set_position(&recv_convertors[1], &offsets[1]); + if (offsets[1] == offsets[0]) { + segment_sizes[0] = remainings[lr]; + } else { + segment_sizes[0] = offsets[1] - offsets[0]; + } + /* send recv */ + MCA_PML_CALL(icrecv(&recv_convertors[0], &segment_sizes[0], + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &reqs[0])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( segindex = 1; segindex < num_segments[lr]; segindex++ ) { - /* determine how many elements to expect in this round */ - if( segindex == (num_segments[lr] - 1)) - sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr]; - /* post new irecv */ - err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr], - datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &new_req)); + remainings[lr] -= segment_sizes[0]; + while (0 != remainings[lr]) { + size_t next_offset; + rc_index ^= 1; + /* determine how many elements are being sent in this round */ + offsets[rc_index] = sizes[lr] - remainings[lr]; + next_offset = offsets[rc_index] + segsize; + if (next_offset > sizes[lr]) { + next_offset = sizes[lr]; + } + if (0 != lr) { + offsets[rc_index] += sizes[0]; + next_offset += sizes[0]; + } + opal_convertor_set_position(&recv_convertors[rc_index^1], &next_offset); + if (next_offset == offsets[rc_index]) { + segment_sizes[rc_index] = remainings[lr]; + } else { + segment_sizes[rc_index] = next_offset - offsets[rc_index]; + } + /* send recv */ + MCA_PML_CALL(icrecv(&recv_convertors[rc_index], &segment_sizes[rc_index], + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &reqs[rc_index])); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - + remainings[lr] -= segment_sizes[rc_index]; /* wait for and forward the previous segment */ - err = ompi_request_wait( &base_req, MPI_STATUS_IGNORE ); + err = ompi_request_wait( &reqs[rc_index^1], MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */ - err = MCA_PML_CALL(send( tmpbuf[lr], segcount[lr], datatype, - tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, + opal_convertor_t send_convertor; + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_next[i]); + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. 
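The intermediate-node code above keeps two receives and two convertors and toggles between them with rc_index ^= 1, so the next segment is already in flight while the previous one is waited on and forwarded to the children. The trace below is only a sketch of that ordering: post_recv, wait_recv and forward_to_children are placeholder stubs standing in for icrecv, ompi_request_wait and the per-child csend, and the segment count is arbitrary.

#include <stdio.h>

/* Placeholder stubs: in the real code these are icrecv, ompi_request_wait and
 * a csend to every child.  Here they only trace the pipeline ordering. */
static void post_recv(int seg)           { printf("post recv  seg %d\n", seg); }
static void wait_recv(int seg)           { printf("wait recv  seg %d\n", seg); }
static void forward_to_children(int seg) { printf("forward    seg %d\n", seg); }

int main(void)
{
    int num_segments = 5;   /* arbitrary */
    int rc_index = 0;       /* selects which request/convertor pair is reused */
    int seg;

    post_recv(0);                            /* prime the pipeline */
    for (seg = 1; seg < num_segments; seg++) {
        rc_index ^= 1;
        post_recv(seg);                      /* keep one receive in flight...   */
        wait_recv(seg - 1);                  /* ...while finishing the previous */
        forward_to_children(seg - 1);        /* and pushing it down the tree    */
    }
    wait_recv(num_segments - 1);             /* drain the last segment */
    forward_to_children(num_segments - 1);
    return 0;
}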
*/ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + opal_convertor_set_position(&send_convertor, &offsets[rc_index^1]); + err = MCA_PML_CALL(csend(&send_convertor, + &segment_sizes[rc_index^1], + tree->tree_next[i], + MCA_COLL_BASE_TAG_BCAST, MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* end of for each child */ - - /* upate the base request */ - base_req = new_req; - /* go to the next buffer (ie. the one corresponding to the next recv) */ - tmpbuf[lr] += realsegsize[lr]; - } /* end of for segindex */ - - /* wait for the last segment and forward current segment */ - err = ompi_request_wait( &base_req, MPI_STATUS_IGNORE ); - for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */ - err = MCA_PML_CALL(send(tmpbuf[lr], sendcount[lr], datatype, - tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm)); + offsets[rc_index^1] = next_offset; + } + err = ompi_request_wait( &reqs[rc_index], MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */ + opal_convertor_t send_convertor; + ompi_proc_t* proc = ompi_comm_peer_lookup(comm,tree->tree_next[i]); + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + opal_convertor_set_position(&send_convertor, &offsets[rc_index]); + err = MCA_PML_CALL(csend(&send_convertor, + &segment_sizes[rc_index], + tree->tree_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* end of for each child */ } /* leaf nodes */ else { - /* Just consume segments as fast as possible */ - sendcount[lr] = segcount[lr]; - for (segindex = 0; segindex < num_segments[lr]; segindex++) { - /* determine how many elements to expect in this round */ - if (segindex == (num_segments[lr] - 1)) - sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr]; - /* receive segments */ - err = MCA_PML_CALL(recv(tmpbuf[lr], sendcount[lr], datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, MPI_STATUS_IGNORE)); + size_t offset, next_offset; + size_t segment_size; + /* determine how many elements are being sent in this round */ + offset = lr?sizes[0]:0; + opal_convertor_set_position(&recv_convertors[0], &offset); + offset = sizes[lr] - remainings[lr]; + next_offset = offset + segsize; + if (next_offset > sizes[lr]) { + next_offset = sizes[lr]; + } + if (0 != lr) { + offset += sizes[0]; + next_offset += sizes[0]; + } + opal_convertor_set_position(&recv_convertors[1], &next_offset); + if (next_offset == offset) { + segment_size = remainings[lr]; + } else { + segment_size = next_offset - offset; + } + /* send recv */ + MCA_PML_CALL(crecv(&recv_convertors[0], &segment_size, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUSES_IGNORE)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + remainings[lr] -= segment_size; + while (0 != remainings[lr]) { + rc_index ^= 1; + /* determine how many elements are being sent in this round */ + offset = sizes[lr] - remainings[lr]; + next_offset = 
offset + segsize; + if (next_offset > sizes[lr]) { + next_offset = sizes[lr]; + } + if (0 != lr) { + offset += sizes[0]; + next_offset += sizes[0]; + } + opal_convertor_set_position(&recv_convertors[rc_index^1], &next_offset); + if (next_offset == offset) { + segment_size = remainings[lr]; + } else { + segment_size = next_offset - offset; + } + /* send recv */ + MCA_PML_CALL(crecv(&recv_convertors[rc_index], &segment_size, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUSES_IGNORE)); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - /* update the initial pointer to the buffer */ - tmpbuf[lr] += realsegsize[lr]; + remainings[lr] -= segment_size; } } /* reset the buffer pointers */ - tmpbuf[0] = (char *) buffer; - tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent; + remainings[0] = sizes[0]; + remainings[1] = sizes[1]; /* Step 2: Find your immediate pair (identical node in opposite subtree) and SendRecv @@ -777,36 +963,123 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, } if ( (size%2) != 0 && rank != root) { - - err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype, - pair, MCA_COLL_BASE_TAG_BCAST, - tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, - pair, MCA_COLL_BASE_TAG_BCAST, - comm, MPI_STATUS_IGNORE, rank); + size_t offset; + opal_convertor_t send_convertor, recv_convertor; + ompi_request_t *req; + ompi_proc_t *proc; + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + recv_convertor.stack_pos = -1; + proc = ompi_comm_peer_lookup(comm, pair); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + offset = lr?sizes[0]:0; + opal_convertor_set_position(&send_convertor, &offset); + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertor); + offset = ((lr+1)%2)?sizes[0]:0; + opal_convertor_set_position(&recv_convertor, &offset); + err = MCA_PML_CALL(icrecv(&recv_convertor, + &sizes[(lr+1)%2], + pair, MCA_COLL_BASE_TAG_BCAST, + comm, &req)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + MCA_PML_CALL(csend(&send_convertor, &sizes[lr], + pair, MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + err = ompi_request_wait( &req, MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } else if ( (size%2) == 0 ) { /* root sends right buffer to the last node */ if( rank == root ) { - err = MCA_PML_CALL(send(tmpbuf[1], counts[1], datatype, - (root+size-1)%size, MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - + if (0 != sizes[1]) { + /* determine how many elements are being sent in this round */ + opal_convertor_set_position(&send_convertors[1], &sizes[0]); + MCA_PML_CALL(csend(&send_convertors[1], &sizes[1], + (root+size-1)%size, MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } } /* last node receives right buffer from the root */ else if (rank == (root+size-1)%size) { - err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype, - root, MCA_COLL_BASE_TAG_BCAST, - comm, MPI_STATUS_IGNORE)); - if (err != 
MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + opal_convertor_t recv_convertor; + proc = ompi_comm_peer_lookup(comm, root); + OBJ_CONSTRUCT(&recv_convertor, opal_convertor_t); + recv_convertor.stack_pos = -1; + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertor); + if (0 != sizes[1]) { + /* determine how many elements are being sent in this round */ + opal_convertor_set_position(&recv_convertor, &sizes[0]); + MCA_PML_CALL(crecv(&recv_convertor, &sizes[1], + root, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUSES_IGNORE)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } } /* everyone else exchanges buffers */ else { - err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype, - pair, MCA_COLL_BASE_TAG_BCAST, - tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, - pair, MCA_COLL_BASE_TAG_BCAST, - comm, MPI_STATUS_IGNORE, rank); + size_t offset; + opal_convertor_t send_convertor, recv_convertor; + ompi_request_t *req; + ompi_proc_t *proc; + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + OBJ_CONSTRUCT(&recv_convertor, opal_convertor_t); + send_convertor.stack_pos = -1; + recv_convertor.stack_pos = -1; + proc = ompi_comm_peer_lookup(comm, pair); + /* We will create a convertor specialized for the */ + /* remote architecture and prepared with the type. */ + opal_convertor_copy_and_prepare_for_send( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &send_convertor); + offset = lr?sizes[0]:0; + opal_convertor_set_position(&send_convertor, &offset); + opal_convertor_copy_and_prepare_for_recv( + proc->super.proc_convertor, + &(datatype->super), + count, + buffer, + 0, + &recv_convertor); + offset = ((lr+1)%2)?sizes[0]:0; + opal_convertor_set_position(&recv_convertor, &offset); + err = MCA_PML_CALL(icrecv(&recv_convertor, + &sizes[(lr+1)%2], + pair, MCA_COLL_BASE_TAG_BCAST, + comm, &req)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + MCA_PML_CALL(csend(&send_convertor, &sizes[lr], + pair, MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + err = ompi_request_wait( &req, MPI_STATUS_IGNORE ); if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } } From 6ae2502cb623113d4609bdf63dff313c0f307890 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 23 Jun 2016 11:00:02 +0900 Subject: [PATCH 8/8] checkpoint --- ompi/mca/coll/base/coll_base_bcast.c | 198 +-------------------------- 1 file changed, 4 insertions(+), 194 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index a95af671d52..aede9089353 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -36,197 +36,6 @@ int ompi_coll_base_bcast_intra_generic( void* buffer, - int count, - struct ompi_datatype_t* datatype, - int root, - struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, - size_t segment_size, - ompi_coll_tree_t* tree ) -{ - int err = 0, line, i, rank, segindex, req_index; - int num_segments; /* Number of segments */ - int sendcount; /* number of elements sent in this segment */ - size_t realsegsize, type_size; - char *tmpbuf; - ptrdiff_t extent, lb; - ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; - ompi_request_t **send_reqs = NULL; - -#if 
OPAL_ENABLE_DEBUG - int size; - size = ompi_comm_size(comm); - assert( size > 1 ); -#endif - rank = ompi_comm_rank(comm); - - ompi_datatype_get_extent (datatype, &lb, &extent); - ompi_datatype_type_size( datatype, &type_size ); - num_segments = (original_count + count_by_segment - 1) / count_by_segment; - realsegsize = (ptrdiff_t)count_by_segment * extent; - - /* Set the buffer pointers */ - tmpbuf = (char *) buffer; - - if( tree->tree_nextsize != 0 ) { - send_reqs = ompi_coll_base_comm_get_reqs(module->base_data, tree->tree_nextsize); - if( NULL == send_reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } - } - - /* Root code */ - if( rank == root ) { - /* - For each segment: - - send segment to all children. - The last segment may have less elements than other segments. - */ - sendcount = count_by_segment; - for( segindex = 0; segindex < num_segments; segindex++ ) { - if( segindex == (num_segments - 1) ) { - sendcount = original_count - segindex * count_by_segment; - } - for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, - tree->tree_next[i], - MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, - &send_reqs[i])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - /* complete the sends before starting the next sends */ - err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, - MPI_STATUSES_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - /* update tmp buffer */ - tmpbuf += realsegsize; - - } - } - - /* Intermediate nodes code */ - else if( tree->tree_nextsize > 0 ) { - /* - Create the pipeline. - 1) Post the first receive - 2) For segments 1 .. num_segments - - post new receive - - wait on the previous receive to complete - - send this data to children - 3) Wait on the last segment - 4) Compute number of elements in last segment. 
- 5) Send the last segment to children - */ - req_index = 0; - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( segindex = 1; segindex < num_segments; segindex++ ) { - - req_index = req_index ^ 0x1; - - /* post new irecv */ - err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment, - datatype, tree->tree_prev, - MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - /* wait for and forward the previous segment to children */ - err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], - MPI_STATUS_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype, - tree->tree_next[i], - MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, - &send_reqs[i])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - /* complete the sends before starting the next iteration */ - err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, - MPI_STATUSES_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - /* Update the receive buffer */ - tmpbuf += realsegsize; - - } - - /* Process the last segment */ - err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment; - for( i = 0; i < tree->tree_nextsize; i++ ) { - err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype, - tree->tree_next[i], - MCA_COLL_BASE_TAG_BCAST, - MCA_PML_BASE_SEND_STANDARD, comm, - &send_reqs[i])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - err = ompi_request_wait_all( tree->tree_nextsize, send_reqs, - MPI_STATUSES_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - /* Leaf nodes */ - else { - /* - Receive all segments from parent in a loop: - 1) post irecv for the first segment - 2) for segments 1 .. 
num_segments - - post irecv for the next segment - - wait on the previous segment to arrive - 3) wait for the last segment - */ - req_index = 0; - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - - for( segindex = 1; segindex < num_segments; segindex++ ) { - req_index = req_index ^ 0x1; - tmpbuf += realsegsize; - /* post receive for the next segment */ - err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, - tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, - comm, &recv_reqs[req_index])); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - /* wait on the previous segment */ - err = ompi_request_wait( &recv_reqs[req_index ^ 0x1], - MPI_STATUS_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); - if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - - return (MPI_SUCCESS); - - error_hndl: - OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank) ); - (void)line; // silence compiler warnings - ompi_coll_base_free_reqs( recv_reqs, 2); - if( NULL != send_reqs ) { - ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); - } - - return err; -} - -int -ompi_coll_base_bcast_intra_generic2( void* buffer, int count, struct ompi_datatype_t* datatype, int root, @@ -252,7 +61,7 @@ ompi_coll_base_bcast_intra_generic2( void* buffer, rank = ompi_comm_rank(comm); if( tree->tree_nextsize != 0 ) { - send_reqs = coll_base_comm_get_reqs(module->base_data, tree->tree_nextsize); + send_reqs = ompi_coll_base_comm_get_reqs(module->base_data, tree->tree_nextsize); if( NULL == send_reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } } @@ -793,10 +602,11 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, offsets[0] += sizes[0]; } opal_convertor_set_position(&recv_convertors[0], &offsets[0]); + offsets[0] += sizes[lr] - remainings[lr]; offsets[1] = offsets[0] + segsize; - if (offsets[1] > sizes[lr]) { - offsets[1] = sizes[lr]; + if (offsets[1] > sizes[lr] + (lr?sizes[0]:0)) { + offsets[1] = sizes[lr] + (lr?sizes[0]:0); } opal_convertor_set_position(&recv_convertors[1], &offsets[1]); if (offsets[1] == offsets[0]) {
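The last hunk widens the clamp on offsets[1]: for the right subtree (lr == 1) the offsets are absolute positions that already include the sizes[0] base, so bounding them by sizes[lr] alone makes the tentative segment end collapse back onto its start, and the fallback path then ships the whole remaining half in one piece, losing the pipelining. Bounding by sizes[lr] plus that base keeps the intended segsize-byte segments. A small standalone check with made-up numbers:

#include <stdio.h>

int main(void)
{
    /* made-up example: 1000-byte message split 500/500, right subtree (lr = 1) */
    size_t sizes[2] = { 500, 500 };
    size_t segsize = 300;
    int lr = 1;
    size_t base = lr ? sizes[0] : 0;

    size_t offset0 = base;                /* absolute start of the next segment */
    size_t offset1 = offset0 + segsize;   /* tentative absolute end: 800 */

    size_t old_clamped = offset1 > sizes[lr] ? sizes[lr] : offset1;               /* 500 */
    size_t new_clamped = offset1 > sizes[lr] + base ? sizes[lr] + base : offset1; /* 800 */

    /* old bound: the end collapses onto the start, so the fallback path sends
     * the whole remaining half at once and the pipeline degenerates.
     * new bound: a proper 300-byte segment is carved out. */
    printf("old bound: end=%zu (start=%zu) -> segment collapses\n",
           old_clamped, offset0);
    printf("new bound: end=%zu (start=%zu) -> %zu-byte segment\n",
           new_clamped, offset0, new_clamped - offset0);
    return 0;
}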