diff --git a/opal/mca/btl/uct/Makefile.am b/opal/mca/btl/uct/Makefile.am deleted file mode 100644 index e1015f2823e..00000000000 --- a/opal/mca/btl/uct/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights -# reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(btl_uct_CPPFLAGS) - -amca_paramdir = $(AMCA_PARAM_SETS_DIR) - -sources = \ - btl_uct.h \ - btl_uct_module.c \ - btl_uct_component.c \ - btl_uct_rdma.h \ - btl_uct_rdma.c \ - btl_uct_endpoint.h \ - btl_uct_endpoint.c \ - btl_uct_amo.c \ - btl_uct_am.h \ - btl_uct_am.c \ - btl_uct_frag.h \ - btl_uct_frag.c \ - btl_uct_tl.c \ - btl_uct_types.h \ - btl_uct_device_context.h - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_opal_btl_uct_DSO -lib = -lib_sources = -component = mca_btl_uct.la -component_sources = $(sources) -else -lib = libmca_btl_uct.la -lib_sources = $(sources) -component = -component_sources = -endif - -mcacomponentdir = $(opallibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_btl_uct_la_SOURCES = $(component_sources) -mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) -mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la $(btl_uct_LIBS) - -noinst_LTLIBRARIES = $(lib) -libmca_btl_uct_la_SOURCES = $(lib_sources) -libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) -libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h deleted file mode 100644 index 38756794430..00000000000 --- a/opal/mca/btl/uct/btl_uct.h +++ /dev/null @@ -1,327 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_BTL_UCT_H -#define MCA_BTL_UCT_H - -#include "opal_config.h" -#include -#include - -/* Open MPI includes */ -#include "opal/mca/event/event.h" -#include "opal/mca/btl/base/base.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/mca/btl/base/btl_base_error.h" -#include "opal/mca/rcache/base/base.h" -#include "opal/class/opal_fifo.h" -#include "opal/class/opal_hash_table.h" -#include "opal/mca/pmix/pmix.h" -#include "opal/threads/tsd.h" -#include - -#include "btl_uct_types.h" - -BEGIN_C_DECLS - -/* detection for old vs new atomic flags */ -#if defined(UCT_IFACE_FLAG_ATOMIC_ADD32) -#define OPAL_HAVE_UCT_EP_ATOMIC64_POST 0 -#else -#define OPAL_HAVE_UCT_EP_ATOMIC64_POST 1 -#endif - -/** - * @brief UCT BTL module - */ -struct mca_btl_uct_module_t { - /** base BTL interface */ - mca_btl_base_module_t super; - - /** whether the module has been fully initialized or not */ - bool initialized; - - /** lock for the hash table */ - opal_mutex_t endpoint_lock; - - /** endpoint hash table */ - opal_hash_table_t id_to_endpoint; - - /** mutex to protect the module */ - opal_recursive_mutex_t lock; - - /** async context */ - ucs_async_context_t *ucs_async; - - /** transport for active messaging */ - mca_btl_uct_tl_t *am_tl; - - /** transport for RDMA/AMOs */ - mca_btl_uct_tl_t *rdma_tl; - - /** transport for forming connections (if needed) */ - mca_btl_uct_tl_t *conn_tl; - - /** array containing the am_tl and rdma_tl */ - mca_btl_uct_tl_t *comm_tls[2]; - - /** registration cache */ - mca_rcache_base_module_t *rcache; - - /** name of the memory domain backing this module */ - char *md_name; - - /** am and rdma share endpoints */ - bool shared_endpoints; - - /** memory domain */ - mca_btl_uct_md_t *md; - - /** un-registered frags that will be used with uct_ep_am_short() */ - opal_free_list_t short_frags; - - /** registered frags that will be used with uct_ep_am_zcopy() */ - opal_free_list_t eager_frags; - - /** large registered frags for packing non-contiguous data */ - opal_free_list_t max_frags; - - /** frags that were waiting on connections that are now ready to send */ - opal_list_t pending_frags; - - /** pending connection requests */ - opal_fifo_t pending_connection_reqs; -}; -typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; - -extern mca_btl_uct_module_t mca_btl_uct_module_template; - -/** - * @brief UCT BTL component - */ -struct mca_btl_uct_component_t { - /** base BTL component */ - mca_btl_base_component_3_0_0_t super; - - /** number of TL modules */ - int module_count; - - /** All BTL UCT modules (1 per memory domain) */ - mca_btl_uct_module_t *modules[MCA_BTL_UCT_MAX_MODULES]; - - /** allowed UCT memory domains */ - char *memory_domains; - - /** allowed transports */ - char *allowed_transports; - - /** number of worker contexts to create */ - int num_contexts_per_module; - -#if OPAL_C_HAVE__THREAD_LOCAL - /** bind threads to contexts */ - bool bind_threads_to_contexts; -#endif - - /** disable UCX memory hooks */ - bool disable_ucx_memory_hooks; -}; -typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; - -OPAL_MODULE_DECLSPEC extern mca_btl_uct_component_t mca_btl_uct_component; - -struct mca_btl_base_registration_handle_t { - /** The packed memory handle. The size of this field is defined by UCT. */ - uint8_t packed_handle[1]; -}; - -struct mca_btl_uct_reg_t { - mca_rcache_base_registration_t base; - - /** UCT memory handle */ - uct_mem_h uct_memh; - - /** remote handle */ - mca_btl_base_registration_handle_t handle; -}; -typedef struct mca_btl_uct_reg_t mca_btl_uct_reg_t; - -OBJ_CLASS_DECLARATION(mca_btl_uct_reg_t); - -#define MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(reg) ((mca_btl_uct_reg_t *)((intptr_t) (reg) - offsetof (mca_btl_uct_reg_t, handle))) - -/** - * Initiate an asynchronous put. - * Completion Semantics: if this function returns a 1 then the operation - * is complete. a return of OPAL_SUCCESS indicates - * the put operation has been queued with the - * network. the local_handle can not be deregistered - * until all outstanding operations on that handle - * have been completed. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param local_address (IN) Local address to put from (registered) - * @param remote_address (IN) Remote address to put to (registered remotely) - * @param local_handle (IN) Registration handle for region containing - * (local_address, local_address + size) - * @param remote_handle (IN) Remote registration handle for region containing - * (remote_address, remote_address + size) - * @param size (IN) Number of bytes to put - * @param flags (IN) Flags for this put operation - * @param order (IN) Ordering - * @param cbfunc (IN) Function to call on completion (if queued) - * @param cbcontext (IN) Context for the callback - * @param cbdata (IN) Data for callback - * - * @retval OPAL_SUCCESS The descriptor was successfully queued for a put - * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put - * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put - * operation. Try again later - * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or - * alignment restrictions. - */ -int mca_btl_uct_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); - -/** - * Initiate an asynchronous get. - * Completion Semantics: if this function returns a 1 then the operation - * is complete. a return of OPAL_SUCCESS indicates - * the get operation has been queued with the - * network. the local_handle can not be deregistered - * until all outstanding operations on that handle - * have been completed. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param local_address (IN) Local address to put from (registered) - * @param remote_address (IN) Remote address to put to (registered remotely) - * @param local_handle (IN) Registration handle for region containing - * (local_address, local_address + size) - * @param remote_handle (IN) Remote registration handle for region containing - * (remote_address, remote_address + size) - * @param size (IN) Number of bytes to put - * @param flags (IN) Flags for this put operation - * @param order (IN) Ordering - * @param cbfunc (IN) Function to call on completion (if queued) - * @param cbcontext (IN) Context for the callback - * @param cbdata (IN) Data for callback - * - * @retval OPAL_SUCCESS The descriptor was successfully queued for a put - * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put - * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put - * operation. Try again later - * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or - * alignment restrictions. - */ -int mca_btl_uct_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); - - /** - * Fault Tolerance Event Notification Function - * @param state Checkpoint Stae - * @return OPAL_SUCCESS or failure status - */ -int mca_btl_uct_ft_event(int state); - -int mca_btl_uct_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, - mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); - -int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, - uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata); - -int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); - - -int mca_btl_uct_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint); -int mca_btl_uct_flush_thread (mca_btl_base_module_t *btl); - -int mca_btl_uct_finalize (mca_btl_base_module_t *btl); - -int mca_btl_uct_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg); -int mca_btl_uct_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg); - -ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsigned flags); - -struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc); - -int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count); -int mca_btl_uct_process_connection_request (mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); - -/** - * @brief Checks if a tl is suitable for using for RDMA - * - * @param[in] tl btl/uct tl pointer - */ -static inline bool mca_btl_uct_tl_supports_rdma (mca_btl_uct_tl_t *tl) -{ - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) == - (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY); -} - -/** - * @brief Checks if a tl is suitable for using for active messaging - */ -static inline bool mca_btl_uct_tl_support_am (mca_btl_uct_tl_t *tl) -{ - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY)); -} - -/** - * @brief Checks if a tl can be used for passing data to connect endpoints - * - * @param[in] tl btl/uct tl pointer - */ -static inline bool mca_btl_uct_tl_supports_conn (mca_btl_uct_tl_t *tl) -{ - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) == - (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE); -} - -/** - * @brief Check if tl endpoints need to be connected via a connection tl - * - * @param[in] tl btl/uct tl pointer - */ -static inline bool mca_btl_uct_tl_requires_connection_tl (mca_btl_uct_tl_t *tl) -{ - return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); -} - -END_C_DECLS -#endif diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c deleted file mode 100644 index 90ea28eed5c..00000000000 --- a/opal/mca/btl/uct/btl_uct_am.c +++ /dev/null @@ -1,343 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_uct_am.h" -#include "btl_uct_rdma.h" -#include "btl_uct_device_context.h" - -/** - * Allocate a segment. - * - * @param btl (IN) BTL module - * @param size (IN) Request segment size. - */ -mca_btl_base_descriptor_t *mca_btl_uct_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - uint8_t order, size_t size, uint32_t flags) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_base_frag_t *frag = NULL; - - if (size <= (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { - frag = mca_btl_uct_frag_alloc_short (uct_btl, endpoint); - } else if (size <= uct_btl->super.btl_eager_limit) { - frag = mca_btl_uct_frag_alloc_eager (uct_btl, endpoint); - } else { - frag = mca_btl_uct_frag_alloc_max (uct_btl, endpoint); - } - - if (OPAL_LIKELY(frag != NULL)) { - frag->segments[0].seg_len = size; - - frag->base.des_segment_count = 1; - frag->base.des_flags = flags; - frag->base.order = order; - frag->uct_iov.length = size; - if (NULL != frag->base.super.registration) { - /* zero-copy fragments will need callbacks */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - } - } - - return (mca_btl_base_descriptor_t *) frag; -} - -static inline void _mca_btl_uct_send_pack (void *data, void *header, size_t header_size, opal_convertor_t *convertor, - size_t payload_size) -{ - uint32_t iov_count = 1; - struct iovec iov; - size_t length; - - if (header_size > 0) { - assert (NULL != header); - memcpy (data, header, header_size); - } - - /* pack the data into the supplied buffer */ - iov.iov_base = (IOVBASE_TYPE *) ((intptr_t) data + header_size); - iov.iov_len = length = payload_size; - - (void) opal_convertor_pack (convertor, &iov, &iov_count, &length); - - assert (length == payload_size); -} - -struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - opal_convertor_t *convertor, - uint8_t order, size_t reserve, - size_t *size, uint32_t flags) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - const size_t total_size = reserve + *size; - mca_btl_uct_base_frag_t *frag; - void *data_ptr; - - /* in place send fragment */ - if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor) || total_size > uct_btl->super.btl_eager_limit)) { - frag = (mca_btl_uct_base_frag_t *) mca_btl_uct_alloc (btl, endpoint, order, total_size, flags); - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - _mca_btl_uct_send_pack ((void *) ((intptr_t) frag->uct_iov.buffer + reserve), NULL, 0, - convertor, *size); - } else { - opal_convertor_get_current_pointer (convertor, &data_ptr); - assert (NULL != data_ptr); - - frag = mca_btl_uct_frag_alloc_short (uct_btl, endpoint); - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - frag->uct_iov.length = total_size; - frag->base.order = order; - frag->base.des_flags = flags; - if (total_size > (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { - frag->segments[0].seg_len = reserve; - frag->segments[1].seg_len = *size; - frag->segments[1].seg_addr.pval = data_ptr; - frag->base.des_segment_count = 2; - } else { - frag->segments[0].seg_len = total_size; - memcpy ((void *)((intptr_t) frag->segments[1].seg_addr.pval + reserve), data_ptr, *size); - frag->base.des_segment_count = 1; - } - } - - return &frag->base; -} - -/** - * Return a segment allocated by this BTL. - * - * @param btl (IN) BTL module - * @param segment (IN) Allocated segment. - */ -int mca_btl_uct_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des) -{ - mca_btl_uct_frag_return ((mca_btl_uct_base_frag_t *) des); - return OPAL_SUCCESS; -} - -static size_t mca_btl_uct_send_frag_pack (void *data, void *arg) -{ - mca_btl_uct_base_frag_t *frag = (mca_btl_uct_base_frag_t *) arg; - size_t length = 8; - - memcpy (data, &frag->header, sizeof (frag->header)); - data = (void *)((intptr_t) data + 8); - - /* this function should only ever get called with fragments with two segments */ - for (size_t i = 0 ; i < frag->base.des_segment_count ; ++i) { - const size_t seg_len = frag->segments[i].seg_len; - memcpy (data, frag->segments[i].seg_addr.pval, seg_len); - data = (void *)((intptr_t) data + seg_len); - length += seg_len; - } - - return length; -} - -static void mca_btl_uct_append_pending_frag (mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t *frag, - mca_btl_uct_device_context_t *context, bool ready) -{ - frag->ready = ready; - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - opal_atomic_wmb (); - - opal_list_append (&uct_btl->pending_frags, (opal_list_item_t *) frag); -} - -int mca_btl_uct_send_frag (mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t *frag, bool append) -{ - mca_btl_uct_device_context_t *context = frag->context; - const ssize_t msg_size = frag->uct_iov.length + 8; - ssize_t size; - ucs_status_t ucs_status; - uct_ep_h ep_handle = NULL; - - /* if we get here then we must have an endpoint handle for this context/endpoint pair */ - (void) mca_btl_uct_endpoint_test_am (uct_btl, frag->endpoint, frag->context, &ep_handle); - assert (NULL != ep_handle); - - /* if another thread set this we really don't care too much as this flag is only meant - * to protect against deep recursion */ - if (!context->in_am_callback) { - mca_btl_uct_context_lock (context); - /* attempt to post the fragment */ - if (NULL != frag->base.super.registration) { - frag->comp.dev_context = context; - ucs_status = uct_ep_am_zcopy (ep_handle, MCA_BTL_UCT_FRAG, &frag->header, sizeof (frag->header), - &frag->uct_iov, 1, 0, &frag->comp.uct_comp); - - if (OPAL_LIKELY(UCS_INPROGRESS == ucs_status)) { - uct_worker_progress (context->uct_worker); - mca_btl_uct_context_unlock (context); - return OPAL_SUCCESS; - } - } else { - /* short message */ - if (1 == frag->base.des_segment_count && (frag->uct_iov.length + 8) < MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { - ucs_status = uct_ep_am_short (ep_handle, MCA_BTL_UCT_FRAG, frag->header.value, frag->uct_iov.buffer, - frag->uct_iov.length); - - if (OPAL_LIKELY(UCS_OK == ucs_status)) { - uct_worker_progress (context->uct_worker); - mca_btl_uct_context_unlock (context); - /* send is complete */ - mca_btl_uct_frag_complete (frag, OPAL_SUCCESS); - return 1; - } - } - - size = uct_ep_am_bcopy (ep_handle, MCA_BTL_UCT_FRAG, mca_btl_uct_send_frag_pack, frag, 0); - if (OPAL_LIKELY(size == msg_size)) { - uct_worker_progress (context->uct_worker); - mca_btl_uct_context_unlock (context); - /* send is complete */ - mca_btl_uct_frag_complete (frag, OPAL_SUCCESS); - return 1; - } - } - - /* wait for something to happen */ - uct_worker_progress (context->uct_worker); - mca_btl_uct_context_unlock (context); - - mca_btl_uct_device_handle_completions (context); - } - - if (!append) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - OPAL_THREAD_LOCK(&uct_btl->lock); - mca_btl_uct_append_pending_frag (uct_btl, frag, context, true); - OPAL_THREAD_UNLOCK(&uct_btl->lock); - - return OPAL_SUCCESS; -} - -int mca_btl_uct_send (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, mca_btl_base_descriptor_t *descriptor, - mca_btl_base_tag_t tag) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_am_context (uct_btl); - mca_btl_uct_base_frag_t *frag = (mca_btl_uct_base_frag_t *) descriptor; - uct_ep_h ep_handle; - int rc; - - BTL_VERBOSE(("btl/uct sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor, - OPAL_PROC_MY_NAME.vpid, endpoint->ep_proc->proc_name.vpid, frag->uct_iov.length)); - - - frag->header.data.tag = tag; - frag->context = context; - - rc = mca_btl_uct_endpoint_check_am (uct_btl, endpoint, context, &ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OPAL_THREAD_LOCK(&uct_btl->lock); - /* check one more time in case another thread is completing the connection now */ - if (OPAL_SUCCESS != mca_btl_uct_endpoint_test_am (uct_btl, endpoint, context, &ep_handle)) { - mca_btl_uct_append_pending_frag (uct_btl, frag, context, false); - OPAL_THREAD_UNLOCK(&uct_btl->lock); - return OPAL_SUCCESS; - } - OPAL_THREAD_UNLOCK(&uct_btl->lock); - } - - return mca_btl_uct_send_frag (uct_btl, frag, true); -} - -struct mca_btl_uct_sendi_pack_args_t { - uint64_t am_header; - void *header; - size_t header_size; - opal_convertor_t *convertor; - size_t payload_size; -}; - -typedef struct mca_btl_uct_sendi_pack_args_t mca_btl_uct_sendi_pack_args_t; - -static size_t mca_btl_uct_sendi_pack (void *data, void *arg) -{ - mca_btl_uct_sendi_pack_args_t *args = (mca_btl_uct_sendi_pack_args_t *) arg; - mca_btl_uct_am_header_t *am_header = (mca_btl_uct_am_header_t *) data; - - am_header->value = args->am_header; - _mca_btl_uct_send_pack ((void *)((intptr_t)data + 8), args->header, args->header_size, args->convertor, - args->payload_size); - return args->header_size + args->payload_size + 8; -} - -static inline size_t mca_btl_uct_max_sendi (mca_btl_uct_module_t *uct_btl, int context_id) -{ - return MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context_id).cap.am.max_bcopy; -} - -int mca_btl_uct_sendi (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, opal_convertor_t *convertor, - void *header, size_t header_size, size_t payload_size, uint8_t order, uint32_t flags, - mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_am_context (uct_btl); - const size_t total_size = header_size + payload_size; - /* message with header */ - const size_t msg_size = total_size + 8; - mca_btl_uct_am_header_t am_header; - ucs_status_t ucs_status = UCS_ERR_NO_RESOURCE; - uct_ep_h ep_handle; - int rc; - - rc = mca_btl_uct_endpoint_check_am (uct_btl, endpoint, context, &ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || msg_size > mca_btl_uct_max_sendi (uct_btl, context->context_id))) { - if (descriptor) { - *descriptor = mca_btl_uct_alloc (btl, endpoint, order, total_size, flags); - } - - return OPAL_ERR_OUT_OF_RESOURCE; - } - - am_header.data.tag = tag; - - mca_btl_uct_context_lock (context); - if (0 == payload_size) { - ucs_status = uct_ep_am_short (ep_handle, MCA_BTL_UCT_FRAG, am_header.value, header, header_size); - } else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id).cap.am.max_short) { - int8_t *data = alloca (total_size); - _mca_btl_uct_send_pack (data, header, header_size, convertor, payload_size); - ucs_status = uct_ep_am_short (ep_handle, MCA_BTL_UCT_FRAG, am_header.value, data, total_size); - } else { - ssize_t size; - - size = uct_ep_am_bcopy (ep_handle, MCA_BTL_UCT_FRAG, mca_btl_uct_sendi_pack, - &(mca_btl_uct_sendi_pack_args_t) {.am_header = am_header.value, - .header = header, .header_size = header_size, - .convertor = convertor, .payload_size = payload_size}, 0); - if (OPAL_LIKELY(size == (ssize_t) msg_size)) { - ucs_status = UCS_OK; - } - } - - mca_btl_uct_context_unlock (context); - - if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { - if (descriptor) { - *descriptor = mca_btl_uct_alloc (btl, endpoint, order, total_size, flags); - } - - return OPAL_ERR_OUT_OF_RESOURCE; - } - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/uct/btl_uct_am.h b/opal/mca/btl/uct/btl_uct_am.h deleted file mode 100644 index 9035540e710..00000000000 --- a/opal/mca/btl/uct/btl_uct_am.h +++ /dev/null @@ -1,38 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#if !defined(MCA_BTL_UCT_AM_H) -#define MCA_BTL_UCT_AM_H - -#include "btl_uct_frag.h" - -struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - opal_convertor_t *convertor, - uint8_t order, size_t reserve, - size_t *size, uint32_t flags); - -int mca_btl_uct_sendi (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, opal_convertor_t *convertor, - void *header, size_t header_size, size_t payload_size, uint8_t order, uint32_t flags, - mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor); - -int mca_btl_uct_send (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, mca_btl_base_descriptor_t *descriptor, - mca_btl_base_tag_t tag); - -int mca_btl_uct_send_frag (mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t *frag, bool append); - -mca_btl_base_descriptor_t *mca_btl_uct_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - uint8_t order, size_t size, uint32_t flags); - -int mca_btl_uct_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des); - - -#endif /* !defined(MCA_BTL_UCT_AM_H) */ diff --git a/opal/mca/btl/uct/btl_uct_amo.c b/opal/mca/btl/uct/btl_uct_amo.c deleted file mode 100644 index f7d02326884..00000000000 --- a/opal/mca/btl/uct/btl_uct_amo.c +++ /dev/null @@ -1,190 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_uct_device_context.h" - -#if OPAL_HAVE_UCT_EP_ATOMIC64_POST -/* we add 1 to the ops to differentiate between unsupported and supported ops since - * UCT_ATOMIC_OP_ADD == 0. otherwise we would have to fill in this table completely. */ -static int mca_btl_uct_btl_to_uct_atomic[MCA_BTL_ATOMIC_LAST] = { - [MCA_BTL_ATOMIC_ADD] = UCT_ATOMIC_OP_ADD + 1, - [MCA_BTL_ATOMIC_AND] = UCT_ATOMIC_OP_AND + 1, - [MCA_BTL_ATOMIC_OR] = UCT_ATOMIC_OP_OR + 1, - [MCA_BTL_ATOMIC_XOR] = UCT_ATOMIC_OP_XOR + 1, - [MCA_BTL_ATOMIC_SWAP] = UCT_ATOMIC_OP_SWAP + 1, -}; -#endif - -int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, - uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); - mca_btl_uct_uct_completion_t *comp = NULL; - ucs_status_t ucs_status; - uct_rkey_bundle_t rkey; - uct_ep_h ep_handle; - int rc; - -#if OPAL_HAVE_UCT_EP_ATOMIC64_POST - int uct_op = mca_btl_uct_btl_to_uct_atomic[op]; - - if (OPAL_UNLIKELY(0 == uct_op--)) { - return OPAL_ERR_BAD_PARAM; - } -#else - if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op && MCA_BTL_ATOMIC_SWAP != op)) { - return OPAL_ERR_BAD_PARAM; - } -#endif - - if (cbfunc) { - comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, - cbfunc, cbcontext, cbdata); - if (OPAL_UNLIKELY(NULL == comp)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_uct_completion_release (comp); - return rc; - } - - mca_btl_uct_context_lock (context); - -#if OPAL_HAVE_UCT_EP_ATOMIC64_POST - if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { - ucs_status = uct_ep_atomic32_fetch (ep_handle, uct_op, operand, (uint32_t *) local_address, remote_address, - rkey.rkey, &comp->uct_comp); - } else { - ucs_status = uct_ep_atomic64_fetch (ep_handle, uct_op, operand, (uint64_t *) local_address, remote_address, - rkey.rkey, &comp->uct_comp); - } -#else - if (MCA_BTL_ATOMIC_ADD == op) { - if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { - ucs_status = uct_ep_atomic_fadd32 (ep_handle, (uint32_t) operand, remote_address, - rkey.rkey, (uint32_t *) local_address, &comp->uct_comp); - } else { - ucs_status = uct_ep_atomic_fadd64 (ep_handle, operand, remote_address, rkey.rkey, - (uint64_t *) local_address, &comp->uct_comp); - } - } else { - if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { - ucs_status = uct_ep_atomic_swap32 (ep_handle, (uint32_t) operand, remote_address, - rkey.rkey, (uint32_t *) local_address, &comp->uct_comp); - } else { - ucs_status = uct_ep_atomic_swap64 (ep_handle, operand, remote_address, rkey.rkey, - (uint64_t *) local_address, &comp->uct_comp); - } - } -#endif - - /* go ahead and progress the worker while we have the lock */ - (void) uct_worker_progress (context->uct_worker); - - mca_btl_uct_context_unlock (context); - - mca_btl_uct_device_handle_completions (context); - - if (UCS_INPROGRESS == ucs_status) { - rc = OPAL_SUCCESS; - } else if (UCS_OK == ucs_status) { - rc = 1; - mca_btl_uct_uct_completion_release (comp); - } else { - rc = OPAL_ERR_OUT_OF_RESOURCE; - mca_btl_uct_uct_completion_release (comp); - } - - uct_rkey_release (&rkey); - - return rc; -} - -int mca_btl_uct_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, - mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - /* this is static so it survives after this function returns. we don't care about the result */ - static uint64_t result; - - /* just use the fetching ops for now. there probably is a performance benefit to using - * the non-fetching on some platforms but this is easier to implement quickly and it - * guarantees remote completion. */ - return mca_btl_uct_afop (btl, endpoint, &result, remote_address, NULL, remote_handle, op, - operand, flags, order, cbfunc, cbcontext, cbdata); -} - -int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); - mca_btl_uct_uct_completion_t *comp = NULL; - ucs_status_t ucs_status; - uct_rkey_bundle_t rkey; - uct_ep_h ep_handle; - int rc; - - if (cbfunc) { - comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, - cbfunc, cbcontext, cbdata); - if (OPAL_UNLIKELY(NULL == comp)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_uct_completion_release (comp); - return rc; - } - - mca_btl_uct_context_lock (context); - - if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { - ucs_status = uct_ep_atomic_cswap32 (ep_handle, (uint32_t) compare, (uint32_t) value, remote_address, - rkey.rkey, (uint32_t *) local_address, &comp->uct_comp); - } else { - ucs_status = uct_ep_atomic_cswap64 (ep_handle, compare, value, remote_address, rkey.rkey, - (uint64_t *) local_address, &comp->uct_comp); - } - - /* go ahead and progress the worker while we have the lock */ - (void) uct_worker_progress (context->uct_worker); - - mca_btl_uct_context_unlock (context); - - mca_btl_uct_device_handle_completions (context); - - if (UCS_INPROGRESS == ucs_status) { - rc = OPAL_SUCCESS; - } else if (UCS_OK == ucs_status) { - rc = 1; - mca_btl_uct_uct_completion_release (comp); - } else { - rc = OPAL_ERR_OUT_OF_RESOURCE; - mca_btl_uct_uct_completion_release (comp); - } - - uct_rkey_release (&rkey); - - return rc; -} diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c deleted file mode 100644 index c8bc9e93775..00000000000 --- a/opal/mca/btl/uct/btl_uct_component.c +++ /dev/null @@ -1,566 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2018 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. - * Copyright (c) 2018 Triad National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "opal_config.h" - -#include "opal/mca/btl/btl.h" -#include "opal/mca/btl/base/base.h" -#include "opal/mca/hwloc/base/base.h" -#include "opal/util/argv.h" -#include "opal/memoryhooks/memory.h" -#include "opal/mca/memory/base/base.h" -#include - -#include - -#include "btl_uct_device_context.h" -#include "btl_uct_am.h" - -static int mca_btl_uct_component_register(void) -{ - mca_btl_uct_module_t *module = &mca_btl_uct_module_template; - - mca_btl_uct_component.memory_domains = "none"; - (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, - "memory_domains", "Comma-delimited list of memory domains of the form " - "to use for communication. Memory domains MUST provide transports that " - "support put, get, and amos. Special values: all (all available), none." - " (default: none)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, - MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, - &mca_btl_uct_component.memory_domains); - - mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any"; - (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, - "transports", "Comma-delimited list of transports to use sorted by increasing " - "priority. The list of transports available can be queried using ucx_info. Special" - "values: any (any available) (default: dc_mlx5,rc_mlx5,ud,any)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports); - - mca_btl_uct_component.num_contexts_per_module = 0; - (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, - "num_contexts_per_module", "Number of UCT worker contexts " - "to create for each BTL module. Larger numbers will improve " - "multi-threaded performance but may increase memory usage. " - "A good rule of thumb is one context per application thread " - "that will be calling into MPI. (default: 0 -- autoselect " - "based on the number of cores)", MCA_BASE_VAR_TYPE_INT, - NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.num_contexts_per_module); - - mca_btl_uct_component.disable_ucx_memory_hooks = true; - (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, - "disable_ucx_memory_hooks", "Disable the munmap memory hook " - "inside UCX. These hooks are not necessary when using the " - "uct btl and tend to cause performance problems when using " - "multiple threads (default: true)", MCA_BASE_VAR_TYPE_BOOL, - NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.disable_ucx_memory_hooks); - - -#if OPAL_C_HAVE__THREAD_LOCAL - mca_btl_uct_component.bind_threads_to_contexts = true; - (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, - "bind_threads_to_contexts", "Bind threads to device contexts. " - "In general this should improve the multi-threaded performance " - "when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL, - NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.bind_threads_to_contexts); -#endif - - /* for now we want this component to lose to btl/ugni and btl/vader */ - module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; - - return mca_btl_base_param_register (&mca_btl_uct_component.super.btl_version, - &module->super); -} - -static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) -{ - ucm_vm_munmap(buf, length); -} - -static int mca_btl_uct_component_open(void) -{ - if (0 == mca_btl_uct_component.num_contexts_per_module) { - /* use the core count and the number of local processes to determine - * how many UCT workers to create */ - int core_count = 36; - - (void) opal_hwloc_base_get_topology (); - core_count = hwloc_get_nbobjs_by_type (opal_hwloc_topology, HWLOC_OBJ_CORE); - - if (core_count <= opal_process_info.num_local_peers || !opal_using_threads()) { - /* there is probably no benefit to using multiple device contexts when not - * using threads or oversubscribing the node with mpi processes. */ - mca_btl_uct_component.num_contexts_per_module = 1; - } else { - mca_btl_uct_component.num_contexts_per_module = core_count / (opal_process_info.num_local_peers + 1); - } - } - - if (mca_btl_uct_component.num_contexts_per_module > MCA_BTL_UCT_MAX_WORKERS) { - mca_btl_uct_component.num_contexts_per_module = MCA_BTL_UCT_MAX_WORKERS; - } - - if (mca_btl_uct_component.disable_ucx_memory_hooks) { - ucm_set_external_event(UCM_EVENT_VM_UNMAPPED); - opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL); - } - - return OPAL_SUCCESS; -} - - -/* - * component cleanup - sanity checking of queue lengths - */ -static int mca_btl_uct_component_close(void) -{ - if (mca_btl_uct_component.disable_ucx_memory_hooks) { - opal_mem_hooks_unregister_release (mca_btl_uct_mem_release_cb); - } - - return OPAL_SUCCESS; -} - -static size_t mca_btl_uct_tl_modex_size (mca_btl_uct_tl_t *tl) -{ - const size_t size = strlen (tl->uct_tl_name) + 1; - - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - /* pad out to a multiple of 4 bytes */ - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len + MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len) & ~3; - } - - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len) & ~3; -} - -static size_t mca_btl_uct_module_modex_size (mca_btl_uct_module_t *module) -{ - size_t modex_size = 4 + strlen (module->md_name) + 1; - - if (module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size (module->rdma_tl); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size (module->am_tl); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { - modex_size += mca_btl_uct_tl_modex_size (module->conn_tl); - } - - return modex_size; -} - -static size_t mca_btl_uct_tl_modex_pack (mca_btl_uct_tl_t *tl, uint8_t *modex_data) -{ - mca_btl_uct_device_context_t *dev_context = tl->uct_dev_contexts[0]; - size_t modex_size = mca_btl_uct_tl_modex_size (tl); - - *((uint32_t *) modex_data) = (uint32_t) modex_size; - modex_data += 4; - - strcpy ((char *) modex_data, tl->uct_tl_name); - modex_data += strlen (tl->uct_tl_name) + 1; - - /* NTH: only the first context is available. i assume the device addresses of the - * contexts will be the same but they will have different iface addresses. i also - * am assuming that it doesn't really matter if all remote contexts connect to - * the same endpoint since we are only doing RDMA. if any of these assumptions are - * wrong then we can't delay creating the other contexts and must include their - * information in the modex. */ - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - uct_iface_get_address (dev_context->uct_iface, (uct_iface_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len; - } - - uct_iface_get_device_address (dev_context->uct_iface, (uct_device_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len; - - return modex_size; -} - -static int mca_btl_uct_modex_send (void) -{ - size_t modex_size = sizeof (mca_btl_uct_modex_t); - mca_btl_uct_modex_t *modex; - uint8_t *modex_data; - int rc; - - for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { - modex_size += mca_btl_uct_module_modex_size (mca_btl_uct_component.modules[i]); - } - - modex = alloca (modex_size); - modex_data = modex->data; - - modex->module_count = mca_btl_uct_component.module_count; - - for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen (module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size (module); - - modex_data += 4; - - strcpy ((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack (module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack (module->am_tl, modex_data); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack (module->conn_tl, modex_data); - } - } - - OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); - return rc; -} - -static mca_btl_uct_module_t *mca_btl_uct_alloc_module (const char *md_name, mca_btl_uct_md_t *md, - size_t registration_size) -{ - mca_btl_uct_module_t *module; - ucs_status_t ucs_status; - - module = malloc (sizeof (*module)); - if (NULL == module) { - return NULL; - } - - /* copy the module template */ - *module = mca_btl_uct_module_template; - - OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); - OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); - OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); - OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&module->pending_connection_reqs, opal_fifo_t); - - module->md = md; - module->md_name = strdup (md_name); - module->super.btl_registration_handle_size = registration_size; - - ucs_status = ucs_async_context_create (UCS_ASYNC_MODE_THREAD, &module->ucs_async); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Could not create a UCT async context")); - mca_btl_uct_finalize (&module->super); - return NULL; - } - - return module; -} - -ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsigned flags) -{ - mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg; - mca_btl_uct_module_t *uct_btl = tl_context->uct_btl; - mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data; - mca_btl_active_message_callback_t *reg; - mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof (*header))}, - .seg_len = length - sizeof (*header)}; - mca_btl_uct_base_frag_t frag = {.base = {.des_segments = &seg, .des_segment_count = 1}}; - - /* prevent recursion */ - tl_context->in_am_callback = true; - - reg = mca_btl_base_active_message_trigger + header->data.tag; - reg->cbfunc (&uct_btl->super, header->data.tag, &frag.base, reg->cbdata); - - tl_context->in_am_callback = false; - - return UCS_OK; -} - -static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc, char **allowed_ifaces) -{ - mca_rcache_base_resources_t rcache_resources; - uct_tl_resource_desc_t *tl_desc; - mca_btl_uct_module_t *module; - uct_md_config_t *uct_config; - uct_md_attr_t md_attr; - mca_btl_uct_md_t *md; - bool found = false; - unsigned num_tls; - char *tmp; - - if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { - BTL_VERBOSE(("created the maximum number of allowable modules")); - return OPAL_ERR_NOT_AVAILABLE; - } - - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); - - for (int j = 0 ; allowed_ifaces[j] ; ++j) { - if (0 == strncmp (allowed_ifaces[j], md_desc->md_name, strlen (md_desc->md_name)) || - 0 == strcmp (allowed_ifaces[j], "all")) { - found = true; - break; - } - } - - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; - } - - md = OBJ_NEW(mca_btl_uct_md_t); - - uct_md_config_read (md_desc->md_name, NULL, NULL, &uct_config); - uct_md_open (md_desc->md_name, uct_config, &md->uct_md); - uct_config_release (uct_config); - - uct_md_query (md->uct_md, &md_attr); - uct_md_query_tl_resources (md->uct_md, &tl_desc, &num_tls); - - module = mca_btl_uct_alloc_module (md_desc->md_name, md, md_attr.rkey_packed_size); - if (NULL == module) { - uct_release_tl_resource_list (tl_desc); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - (void) mca_btl_uct_query_tls (module, md, tl_desc, num_tls); - - uct_release_tl_resource_list (tl_desc); - - /* release the initial reference to the md object. if any modules were created the UCT md will remain - * open until those modules are finalized. */ - OBJ_RELEASE(md); - - if (NULL == module->am_tl && NULL == module->rdma_tl) { - BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); - mca_btl_uct_finalize (&module->super); - return OPAL_ERR_NOT_AVAILABLE; - } - - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. */ - (void) asprintf (&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof (mca_btl_uct_reg_t) + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources); - free (tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize (&module->super); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -/* - * UCT component initialization: - * (1) read interface list from kernel and compare against component parameters - * then create a BTL instance for selected interfaces - * (2) setup UCT listen socket for incoming connection attempts - * (3) register BTL parameters with the MCA - */ - -static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, bool enable_progress_threads, - bool enable_mpi_threads) -{ - /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ - struct mca_btl_base_module_t **base_modules; - uct_md_resource_desc_t *resources; - unsigned resource_count; - char **allowed_ifaces; - int rc; - - BTL_VERBOSE(("initializing uct btl")); - - if (NULL == mca_btl_uct_component.memory_domains || 0 == strlen (mca_btl_uct_component.memory_domains) || - 0 == strcmp (mca_btl_uct_component.memory_domains, "none")) { - BTL_VERBOSE(("no uct memory domains specified")); - return NULL; - } - - allowed_ifaces = opal_argv_split (mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { - return NULL; - } - - uct_query_md_resources (&resources, &resource_count); - - mca_btl_uct_component.module_count = 0; - - /* generate all suitable btl modules */ - for (unsigned i = 0 ; i < resource_count ; ++i) { - rc = mca_btl_uct_component_process_uct_md (resources + i, allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } - } - - opal_argv_free (allowed_ifaces); - uct_release_md_resource_list (resources); - - mca_btl_uct_modex_send (); - - /* pass module array back to caller */ - base_modules = calloc (mca_btl_uct_component.module_count, sizeof (*base_modules)); - if (NULL == base_modules) { - return NULL; - } - - memcpy (base_modules, mca_btl_uct_component.modules, mca_btl_uct_component.module_count * - sizeof (mca_btl_uct_component.modules[0])); - - *num_btl_modules = mca_btl_uct_component.module_count; - - BTL_VERBOSE(("uct btl initialization complete. found %d suitable memory domains", - mca_btl_uct_component.module_count)); - - return base_modules; -} - -static int mca_btl_uct_tl_progress (mca_btl_uct_tl_t *tl, int starting_index) -{ - unsigned int ret = 0; - - if (NULL == tl) { - return 0; - } - - for (int j = 0 ; j < tl->max_device_contexts ; ++j) { - if (tl->uct_dev_contexts[j]) { - ret += mca_btl_uct_context_progress (tl->uct_dev_contexts[j]); - } - } - - return ret; -} - -static int mca_btl_uct_component_progress_pending (mca_btl_uct_module_t *uct_btl) -{ - mca_btl_uct_base_frag_t *frag, *next; - size_t count; - - if (0 == (count = opal_list_get_size (&uct_btl->pending_frags))) { - return 0; - } - - OPAL_THREAD_LOCK(&uct_btl->lock); - OPAL_LIST_FOREACH_SAFE(frag, next, &uct_btl->pending_frags, mca_btl_uct_base_frag_t) { - if (!frag->ready) { - continue; - } - - opal_list_remove_item (&uct_btl->pending_frags, (opal_list_item_t *) frag); - - if (OPAL_SUCCESS > mca_btl_uct_send_frag (uct_btl, frag, false)) { - opal_list_prepend (&uct_btl->pending_frags, (opal_list_item_t *) frag); - } - } - OPAL_THREAD_UNLOCK(&uct_btl->lock); - - return OPAL_SUCCESS; -} - -/** - * @brief UCT BTL progress function - * - * This function explictly progresses all workers. - */ -static int mca_btl_uct_component_progress (void) -{ - int starting_index = mca_btl_uct_get_context_index (); - unsigned ret = 0; - - for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - - /* unlike ucp, uct actually tells us something useful! its almost like it was "inspired" - * by the btl progress functions.... */ - ret += mca_btl_uct_tl_progress (module->rdma_tl, starting_index); - - if (module->am_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress (module->am_tl, starting_index); - } - - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress (module->conn_tl, 0); - } - - while (NULL != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic (&module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request (module, (mca_btl_uct_conn_req_t *) request->request_data); - OBJ_RELEASE(request); - } - } - - if (0 != opal_list_get_size (&module->pending_frags)) { - mca_btl_uct_component_progress_pending (module); - } - } - - return (int) ret; -} - -/** UCT btl component */ -mca_btl_uct_component_t mca_btl_uct_component = { - .super = { - .btl_version = { - MCA_BTL_DEFAULT_VERSION("uct"), - .mca_open_component = mca_btl_uct_component_open, - .mca_close_component = mca_btl_uct_component_close, - .mca_register_component_params = mca_btl_uct_component_register, - }, - .btl_data = { - /* The component is not checkpoint ready */ - .param_field = MCA_BASE_METADATA_PARAM_NONE - }, - - .btl_init = mca_btl_uct_component_init, - .btl_progress = mca_btl_uct_component_progress, - } -}; diff --git a/opal/mca/btl/uct/btl_uct_device_context.h b/opal/mca/btl/uct/btl_uct_device_context.h deleted file mode 100644 index 12ef1e1f42c..00000000000 --- a/opal/mca/btl/uct/btl_uct_device_context.h +++ /dev/null @@ -1,160 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#if !defined(BTL_UCT_DEVICE_CONTEXT_H) -#define BTL_UCT_DEVICE_CONTEXT_H - -#include "btl_uct.h" -#include "btl_uct_rdma.h" -#include "btl_uct_frag.h" - -/** - * @brief Create a new device context for the given transport - * - * @param[in] module btl uct module - * @param[in] tl btl uct tl pointer - * @param[in] context_id identifier for this context (0..MCA_BTL_UCT_MAX_WORKERS-1) - */ -mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress); - -/** - * @brief Destroy a device context and release all resources - * - * @param[in] context btl uct device context - * - * This call frees a device context and all assoicated resources. It is not - * valid to use the device context after this returns. - */ -void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context); - -static inline bool mca_btl_uct_context_trylock (mca_btl_uct_device_context_t *context) -{ - return OPAL_THREAD_TRYLOCK(&context->mutex); -} - -static inline void mca_btl_uct_context_lock (mca_btl_uct_device_context_t *context) -{ - OPAL_THREAD_LOCK (&context->mutex); -} - -static inline void mca_btl_uct_context_unlock (mca_btl_uct_device_context_t *context) -{ - OPAL_THREAD_UNLOCK (&context->mutex); -} - -#define MCA_BTL_UCT_CONTEXT_SERIALIZE(context,code) \ - do { \ - mca_btl_uct_context_lock (context); \ - code; \ - mca_btl_uct_context_unlock(context); \ - } while (0); - -static inline int mca_btl_uct_get_context_index (void) -{ - static volatile uint32_t next_uct_index = 0; - int context_id; - -#if OPAL_C_HAVE__THREAD_LOCAL - if (mca_btl_uct_component.bind_threads_to_contexts) { - static _Thread_local int uct_index = -1; - - context_id = uct_index; - if (OPAL_UNLIKELY(-1 == context_id)) { - context_id = uct_index = opal_atomic_fetch_add_32 ((volatile int32_t *) &next_uct_index, 1) % - mca_btl_uct_component.num_contexts_per_module; - } - } else { -#endif - /* avoid using atomics in this. i doubt it improves performance to ensure atomicity on the next - * index in this case. */ - context_id = next_uct_index++ % mca_btl_uct_component.num_contexts_per_module; -#if OPAL_C_HAVE__THREAD_LOCAL - } -#endif - - return context_id; -} - -static inline mca_btl_uct_device_context_t * -mca_btl_uct_module_get_tl_context_specific (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id) -{ - mca_btl_uct_device_context_t *context = tl->uct_dev_contexts[context_id]; - - if (OPAL_UNLIKELY(NULL == context)) { - OPAL_THREAD_LOCK(&module->lock); - context = tl->uct_dev_contexts[context_id]; - if (OPAL_UNLIKELY(NULL == context)) { - context = tl->uct_dev_contexts[context_id] = mca_btl_uct_context_create (module, tl, context_id, true); - } - OPAL_THREAD_UNLOCK(&module->lock); - } - - return context; -} - -static inline mca_btl_uct_device_context_t *mca_btl_uct_module_get_rdma_context (mca_btl_uct_module_t *module) -{ - return mca_btl_uct_module_get_tl_context_specific (module, module->rdma_tl, mca_btl_uct_get_context_index ()); -} - -static inline mca_btl_uct_device_context_t *mca_btl_uct_module_get_rdma_context_specific (mca_btl_uct_module_t *module, int context_id) -{ - return mca_btl_uct_module_get_tl_context_specific (module, module->rdma_tl, context_id); -} - -static inline mca_btl_uct_device_context_t *mca_btl_uct_module_get_am_context (mca_btl_uct_module_t *module) -{ - return mca_btl_uct_module_get_tl_context_specific (module, module->am_tl, mca_btl_uct_get_context_index ()); -} - -static inline void mca_btl_uct_device_handle_completions (mca_btl_uct_device_context_t *dev_context) -{ - mca_btl_uct_uct_completion_t *comp; - - while (NULL != (comp = (mca_btl_uct_uct_completion_t *) opal_fifo_pop (&dev_context->completion_fifo))) { - int rc = UCS_OK == comp->status ? OPAL_SUCCESS : OPAL_ERROR; - - if (comp->frag) { - /* reset the count */ - comp->uct_comp.count = 1; - mca_btl_uct_frag_complete (comp->frag, rc); - - continue; - } - - /* we may be calling the callback before remote completion. this is in violation of the - * btl interface specification but should not hurt in non-ob1 use cases. if this ever - * becomes a problem we can look at possible solutions. */ - comp->cbfunc (comp->btl, comp->endpoint, comp->local_address, comp->local_handle, - comp->cbcontext, comp->cbdata, rc); - mca_btl_uct_uct_completion_release (comp); - } -} - -static inline int mca_btl_uct_context_progress (mca_btl_uct_device_context_t *context) -{ - int ret = 0; - - if (!context->uct_worker) { - return 0; - } - - if (!mca_btl_uct_context_trylock (context)) { - ret = uct_worker_progress (context->uct_worker); - mca_btl_uct_context_unlock (context); - - mca_btl_uct_device_handle_completions (context); - } - - return ret; -} - -#endif /* BTL_UCT_DEVICE_CONTEXT_H */ diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c deleted file mode 100644 index 40349673e27..00000000000 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ /dev/null @@ -1,409 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2018 Triad National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_uct.h" -#include "btl_uct_endpoint.h" -#include "btl_uct_device_context.h" -#include "btl_uct_am.h" -#include "opal/util/proc.h" - -static void mca_btl_uct_endpoint_construct (mca_btl_uct_endpoint_t *endpoint) -{ - memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module); - endpoint->conn_ep = NULL; - OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t); -} - -static void mca_btl_uct_endpoint_destruct (mca_btl_uct_endpoint_t *endpoint) -{ - for (int tl_index = 0 ; tl_index < 2 ; ++tl_index) { - for (int i = 0 ; i < mca_btl_uct_component.num_contexts_per_module ; ++i) { - if (NULL != endpoint->uct_eps[i][tl_index].uct_ep) { - uct_ep_destroy (endpoint->uct_eps[i][tl_index].uct_ep); - } - } - } - - OBJ_DESTRUCT(&endpoint->ep_lock); -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_endpoint_t, opal_object_t, - mca_btl_uct_endpoint_construct, - mca_btl_uct_endpoint_destruct); - -mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc) -{ - mca_btl_uct_endpoint_t *endpoint = calloc (1, sizeof (*endpoint) + sizeof (endpoint->uct_eps[0]) * - mca_btl_uct_component.num_contexts_per_module); - - if (OPAL_UNLIKELY(NULL == endpoint)) { - return NULL; - } - - OBJ_CONSTRUCT(endpoint, mca_btl_uct_endpoint_t); - endpoint->ep_proc = proc; - - return (mca_btl_base_endpoint_t *) endpoint; -} - -static unsigned char *mca_btl_uct_process_modex_tl (unsigned char *modex_data) -{ - BTL_VERBOSE(("processing modex for tl %s. size: %u", modex_data + 4, *((uint32_t *) modex_data))); - - /* skip size and name */ - return modex_data + 4 + strlen ((char *) modex_data + 4) + 1; -} - -static void mca_btl_uct_process_modex (mca_btl_uct_module_t *uct_btl, unsigned char *modex_data, - unsigned char **rdma_tl_data, unsigned char **am_tl_data, - unsigned char **conn_tl_data) -{ - BTL_VERBOSE(("processing remote modex data")); - - if (uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains RDMA data")); - if (rdma_tl_data) { - *rdma_tl_data = mca_btl_uct_process_modex_tl (modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (rdma_tl_data) { - *rdma_tl_data = NULL; - } - - if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains active message data")); - if (am_tl_data) { - *am_tl_data = mca_btl_uct_process_modex_tl (modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (am_tl_data) { - *am_tl_data = NULL; - } - - if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl && uct_btl->conn_tl != uct_btl->am_tl) { - BTL_VERBOSE(("modex contains connection data")); - if (conn_tl_data) { - *conn_tl_data = mca_btl_uct_process_modex_tl (modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (conn_tl_data) { - *conn_tl_data = NULL; - } -} - -static int mca_btl_uct_endpoint_connect_iface (mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, - mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data) -{ - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; - - /* easy case. just connect to the interface */ - iface_addr = (uct_iface_addr_t *) tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).iface_addr_len); - - BTL_VERBOSE(("connecting endpoint to interface")); - - mca_btl_uct_context_lock (tl_context); - ucs_status = uct_ep_create_connected (tl_context->uct_iface, device_addr, iface_addr, &tl_endpoint->uct_ep); - tl_endpoint->flags = MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY; - mca_btl_uct_context_unlock (tl_context); - - return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR; -} - -static void mca_btl_uct_connection_ep_construct (mca_btl_uct_connection_ep_t *ep) -{ - ep->uct_ep = NULL; -} - -static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep) -{ - if (ep->uct_ep) { - uct_ep_destroy (ep->uct_ep); - ep->uct_ep = NULL; - } -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, - mca_btl_uct_connection_ep_destruct); - -struct mca_btl_uct_conn_completion_t { - uct_completion_t super; - volatile bool complete; -}; -typedef struct mca_btl_uct_conn_completion_t mca_btl_uct_conn_completion_t; - -static void mca_btl_uct_endpoint_flush_complete (uct_completion_t *self, ucs_status_t status) -{ - mca_btl_uct_conn_completion_t *completion = (mca_btl_uct_conn_completion_t *) self; - BTL_VERBOSE(("connection flush complete")); - completion->complete = true; -} - -static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_device_context_t *conn_tl_context, - mca_btl_uct_conn_req_t *request, size_t request_length) -{ - mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; - mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, - .complete = false}; - ucs_status_t ucs_status; - - BTL_VERBOSE(("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t, - request->context_id, request->type, request_length)); - - OBJ_RETAIN(endpoint->conn_ep); - - /* need to drop the lock to avoid hold-and-wait */ - opal_mutex_unlock (&endpoint->ep_lock); - - do { - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { - ucs_status = uct_ep_am_short (conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type, request, - request_length); - }); - if (OPAL_LIKELY(UCS_OK == ucs_status)) { - break; - } - - if (OPAL_UNLIKELY(UCS_ERR_NO_RESOURCE != ucs_status)) { - return OPAL_ERROR; - } - - /* some TLs (UD for example) need to be progressed to get resources */ - mca_btl_uct_context_progress (conn_tl_context); - } while (1); - - /* for now we just wait for the connection request to complete before continuing */ - ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, &completion.super); - if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { - /* NTH: I don't know if this path is needed. For some networks we must use a completion. */ - do { - ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL); - mca_btl_uct_context_progress (conn_tl_context); - } while (UCS_INPROGRESS == ucs_status); - } else { - do { - mca_btl_uct_context_progress (conn_tl_context); - } while (!completion.complete); - } - - opal_mutex_lock (&endpoint->ep_lock); - - OBJ_RELEASE(endpoint->conn_ep); - - return OPAL_SUCCESS; -} - -static int mca_btl_uct_endpoint_connect_endpoint (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, - mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, - uint8_t *conn_tl_data, void *ep_addr) -{ - size_t request_length = sizeof (mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; - mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - mca_btl_uct_conn_req_t *request = alloca (request_length); - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; - int rc; - - assert (NULL != conn_tl); - - BTL_VERBOSE(("connecting endpoint to remote endpoint")); - - if (NULL == conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print (endpoint->ep_proc->proc_name))); - - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); - - endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { - ucs_status = uct_ep_create_connected (conn_tl_context->uct_iface, device_addr, iface_addr, - &conn_ep->uct_ep); - }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(conn_ep); - } - - /* fill in common request parameters */ - request->proc_name = OPAL_PROC_MY_NAME; - request->context_id = tl_context->context_id; - request->tl_index = tl->tl_index; - request->type = !!(ep_addr); - - if (NULL == tl_endpoint->uct_ep) { - BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data", - opal_process_name_print (endpoint->ep_proc->proc_name))); - - MCA_BTL_UCT_CONTEXT_SERIALIZE(tl_context, { - ucs_status = uct_ep_create (tl_context->uct_iface, &tl_endpoint->uct_ep); - }); - if (UCS_OK != ucs_status) { - OBJ_RELEASE(endpoint->conn_ep); - return OPAL_ERROR; - } - } - - if (ep_addr) { - BTL_VERBOSE(("using remote endpoint address to connect endpoint for tl %s, index %d. ep_addr = %p", - tl->uct_tl_name, tl_context->context_id, ep_addr)); - - /* NTH: there is no need to lock the device context in this case */ - ucs_status = uct_ep_connect_to_ep (tl_endpoint->uct_ep, (uct_device_addr_t *) tl_data, ep_addr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - } - - /* fill in connection request */ - ucs_status = uct_ep_get_address (tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); - if (UCS_OK != ucs_status) { - /* this is a fatal a fatal error */ - OBJ_RELEASE(endpoint->conn_ep); - uct_ep_destroy (tl_endpoint->uct_ep); - tl_endpoint->uct_ep = NULL; - return OPAL_ERROR; - } - - /* let the remote side know that the connection has been established and - * wait for the message to be sent */ - rc = mca_btl_uct_endpoint_send_conn_req (uct_btl, endpoint, conn_tl_context, request, request_length); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OBJ_RELEASE(endpoint->conn_ep); - uct_ep_destroy (tl_endpoint->uct_ep); - tl_endpoint->uct_ep = NULL; - return OPAL_ERROR; - } - - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS : OPAL_ERR_OUT_OF_RESOURCE; -} - -int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, - void *ep_addr, int tl_index) -{ - mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index; - mca_btl_uct_tl_t *tl = (tl_index == uct_btl->rdma_tl->tl_index) ? uct_btl->rdma_tl : uct_btl->am_tl; - mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (uct_btl, tl, context_id); - uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; - mca_btl_uct_connection_ep_t *conn_ep = NULL; - mca_btl_uct_modex_t *modex; - uint8_t *modex_data; - size_t msg_size; - int rc; - - /* only two types of endpoints at this time */ - assert (tl_index < 2); - - if (OPAL_UNLIKELY(NULL == tl)) { - return OPAL_ERR_UNREACH; - } - - BTL_VERBOSE(("checking endpoint %p with context id %d. cached uct ep: %p, ready: %d", (void *) endpoint, context_id, - (void *) tl_endpoint->uct_ep, !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags))); - - opal_mutex_lock (&endpoint->ep_lock); - if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { - opal_mutex_unlock (&endpoint->ep_lock); - /* nothing more to do. someone else completed the connection */ - return OPAL_SUCCESS; - } - - /* dumpicate connection request. nothing to do until the endpoint data is received */ - if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { - opal_mutex_unlock (&endpoint->ep_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - do { - /* read the modex. this is done both to start the connection and to process endpoint data */ - OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, - &endpoint->ep_proc->proc_name, (void **)&modex, &msg_size); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("error receiving modex")); - break; - } - - BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d", (unsigned long) msg_size, - OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); - modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0 ; i < modex->module_count ; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); - - modex_data += 4; - - if (0 != strcmp ((char *) modex_data, uct_btl->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - - modex_data += strlen ((char *) modex_data) + 1; - - mca_btl_uct_process_modex (uct_btl, modex_data, &rdma_tl_data, &am_tl_data, &conn_tl_data); - break; - } - - tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data; - - if (NULL == tl_data) { - opal_mutex_unlock (&endpoint->ep_lock); - return OPAL_ERR_UNREACH; - } - - /* connect the endpoint */ - if (!mca_btl_uct_tl_requires_connection_tl (tl)) { - rc = mca_btl_uct_endpoint_connect_iface (uct_btl, tl, tl_context, tl_endpoint, tl_data); - } else { - rc = mca_btl_uct_endpoint_connect_endpoint (uct_btl, endpoint, tl, tl_context, tl_endpoint, - tl_data, conn_tl_data, ep_addr); - } - - } while (0); - - /* to avoid a possible hold-and wait deadlock. destroy the endpoint after dropping the endpoint lock. */ - if (endpoint->conn_ep && 1 == endpoint->conn_ep->super.obj_reference_count) { - conn_ep = endpoint->conn_ep; - endpoint->conn_ep = NULL; - } - - opal_mutex_unlock (&endpoint->ep_lock); - - if (conn_ep) { - OBJ_RELEASE(conn_ep); - } - - BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? "" : " not yet")); - - return rc; -} diff --git a/opal/mca/btl/uct/btl_uct_endpoint.h b/opal/mca/btl/uct/btl_uct_endpoint.h deleted file mode 100644 index 6add6f27193..00000000000 --- a/opal/mca/btl/uct/btl_uct_endpoint.h +++ /dev/null @@ -1,95 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_UCT_ENDPOINT_H -#define MCA_BTL_UCT_ENDPOINT_H - -#include "opal/class/opal_list.h" -#include "opal/mca/event/event.h" -#include "btl_uct.h" - -BEGIN_C_DECLS - -mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc); -int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, int ep_index, void *ep_addr, int tl_index); - -static inline int mca_btl_uct_endpoint_test_am (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, - mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle) -{ - int tl_index = module->am_tl->tl_index; - int ep_index = context->context_id; - - if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[ep_index][tl_index].flags)) { - *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; - return OPAL_SUCCESS; - } - - return OPAL_ERR_NOT_AVAILABLE; -} - -/** - * @brief Check if the endpoint is connected and start the connection if not - * - * @param[in] module UCT BTL module - * @param[in] endpoint UCT BTL endpoint - * @param[in] context UCT BTL device context - * @param[out] ep_handle UCT endpoint handle - * @param[in] tl_index UCT TL index (0 or 1) - * - * @returns OPAL_SUCCESS if the endpoint is connected and ready to us - * @returns OPAL_ERR_RESOURCE_BUSY if the connection is underway - * @returns OPAL_ERROR otherwise - */ -static inline int mca_btl_uct_endpoint_check (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, - mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle, - const int tl_index) -{ - int ep_index = context->context_id; - int rc; - - if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[ep_index][tl_index].flags)) { - *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; - return OPAL_SUCCESS; - } - - rc = mca_btl_uct_endpoint_connect (module, endpoint, ep_index, NULL, tl_index); - *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; - BTL_VERBOSE(("mca_btl_uct_endpoint_connect returned %d. context id = %d, flags = 0x%x", rc, ep_index, - MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[ep_index][tl_index].flags)); - return rc; -} - -static inline int mca_btl_uct_endpoint_check_rdma (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, - mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle) -{ - assert (NULL != module->rdma_tl); - return mca_btl_uct_endpoint_check (module, endpoint, context, ep_handle, module->rdma_tl->tl_index); -} - -static inline int mca_btl_uct_endpoint_check_am (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, - mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle) -{ - assert (NULL != module->am_tl); - return mca_btl_uct_endpoint_check (module, endpoint, context, ep_handle, module->am_tl->tl_index); -} - -END_C_DECLS -#endif diff --git a/opal/mca/btl/uct/btl_uct_frag.c b/opal/mca/btl/uct/btl_uct_frag.c deleted file mode 100644 index 3e5622cac45..00000000000 --- a/opal/mca/btl/uct/btl_uct_frag.c +++ /dev/null @@ -1,55 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_uct_frag.h" - -static void mca_btl_uct_frag_completion (uct_completion_t *uct_comp, ucs_status_t status) -{ - mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) ((uintptr_t) uct_comp - offsetof (mca_btl_uct_uct_completion_t, uct_comp)); - - BTL_VERBOSE(("frag operation complete. frag = %p. status = %d", (void *) comp->frag, status)); - - comp->status = status; - opal_fifo_push (&comp->dev_context->completion_fifo, &comp->super.super); -} - -static void mca_btl_uct_base_frag_constructor (mca_btl_uct_base_frag_t *frag) -{ - mca_btl_uct_reg_t *reg = (mca_btl_uct_reg_t *) frag->base.super.registration; - - /* zero everything out */ - memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); - - OBJ_CONSTRUCT(&frag->comp, mca_btl_uct_uct_completion_t); - - frag->base.des_segments = frag->segments; - frag->base.des_segment_count = 1; - - frag->comp.uct_comp.func = mca_btl_uct_frag_completion; - frag->comp.uct_comp.count = 1; - frag->comp.frag = frag; - - frag->segments[0].seg_addr.pval = frag->base.super.ptr; - frag->uct_iov.buffer = frag->base.super.ptr; - frag->uct_iov.stride = 0; - frag->uct_iov.count = 1; - if (reg) { - frag->uct_iov.memh = reg->uct_memh; - } -} - -static void mca_btl_uct_base_frag_destructor (mca_btl_uct_base_frag_t *frag) -{ - OBJ_DESTRUCT(&frag->comp); -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_base_frag_t, mca_btl_base_descriptor_t, - mca_btl_uct_base_frag_constructor, mca_btl_uct_base_frag_destructor); diff --git a/opal/mca/btl/uct/btl_uct_frag.h b/opal/mca/btl/uct/btl_uct_frag.h deleted file mode 100644 index 8aa8789d0e3..00000000000 --- a/opal/mca/btl/uct/btl_uct_frag.h +++ /dev/null @@ -1,63 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#if !defined(MCA_BTL_UCT_FRAG_H) -#define MCA_BTL_UCT_FRAG_H - -#include "btl_uct.h" - -static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc (mca_btl_uct_module_t *uct_btl, opal_free_list_t *fl, - mca_btl_base_endpoint_t *endpoint) -{ - mca_btl_uct_base_frag_t *frag = (mca_btl_uct_base_frag_t *) opal_free_list_get (fl); - if (OPAL_LIKELY(NULL != frag)) { - frag->free_list = fl; - frag->endpoint = endpoint; - frag->btl = uct_btl; - } - - return frag; -} - -static inline void mca_btl_uct_frag_return (mca_btl_uct_base_frag_t *frag) -{ - opal_free_list_return (frag->free_list, &frag->base.super); -} - -static inline void mca_btl_uct_frag_complete (mca_btl_uct_base_frag_t *frag, int rc) { - mca_btl_uct_module_t *uct_btl = frag->btl; - - /* call callback if specified */ - if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - frag->base.des_cbfunc(&uct_btl->super, frag->endpoint, &frag->base, rc); - } - - if (OPAL_LIKELY(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { - mca_btl_uct_frag_return (frag); - } -} - -static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc_short (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint) -{ - return mca_btl_uct_frag_alloc (uct_btl, &uct_btl->short_frags, endpoint); -} - -static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc_eager (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint) -{ - return mca_btl_uct_frag_alloc (uct_btl, &uct_btl->eager_frags, endpoint); -} - -static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc_max (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint) -{ - return mca_btl_uct_frag_alloc (uct_btl, &uct_btl->max_frags, endpoint); -} - -#endif /* !defined(MCA_BTL_UCT_FRAG_H) */ diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c deleted file mode 100644 index f0802867546..00000000000 --- a/opal/mca/btl/uct/btl_uct_module.c +++ /dev/null @@ -1,365 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" -#include -#include "opal/class/opal_bitmap.h" -#include "opal/mca/btl/btl.h" -#include "opal/datatype/opal_convertor.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/mpool.h" - -#include "btl_uct.h" -#include "btl_uct_endpoint.h" -#include "btl_uct_am.h" - -struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) module; - mca_btl_base_endpoint_t *ep; - int rc; - - opal_mutex_lock (&uct_module->endpoint_lock); - - do { - rc = opal_hash_table_get_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) proc, (void **) &ep); - if (OPAL_SUCCESS == rc) { - BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name))); - break; - } - - /* Create and Init endpoints */ - ep = mca_btl_uct_endpoint_create (proc); - if (OPAL_UNLIKELY(NULL == ep)) { - BTL_ERROR(("btl/uct error initializing endpoint")); - break; - } - - BTL_VERBOSE(("endpoint initialized. new endpoint: %p", (void *) ep)); - - /* add this endpoint to the connection lookup table */ - (void) opal_hash_table_set_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) proc, ep); - } while (0); - - opal_mutex_unlock (&uct_module->endpoint_lock); - - return ep; -} - -static int mca_btl_uct_add_procs (mca_btl_base_module_t *btl, - size_t nprocs, opal_proc_t **opal_procs, - mca_btl_base_endpoint_t **peers, - opal_bitmap_t *reachable) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; - int rc; - - if (false == uct_module->initialized) { - mca_btl_uct_tl_t *am_tl = uct_module->am_tl; - - /* NTH: might want to vary this size based off the universe size (if - * one exists). the table is only used for connection lookup and - * endpoint removal. */ - rc = opal_hash_table_init (&uct_module->id_to_endpoint, 512); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("error initializing the endpoint hash. rc = %d", rc)); - return rc; - } - - if (am_tl) { - rc = opal_free_list_init (&uct_module->short_frags, sizeof (mca_btl_uct_base_frag_t), - opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), - MCA_BTL_UCT_TL_ATTR(am_tl, 0).cap.am.max_short, opal_cache_line_size, - 0, 1024, 64, NULL, 0, NULL, NULL, NULL); - - rc = opal_free_list_init (&uct_module->eager_frags, sizeof (mca_btl_uct_base_frag_t), - opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), - btl->btl_eager_limit, opal_cache_line_size, - 0, 1024, 64, NULL, 0, uct_module->rcache, NULL, NULL); - - rc = opal_free_list_init (&uct_module->max_frags, sizeof (mca_btl_uct_base_frag_t), - opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), - btl->btl_max_send_size, opal_cache_line_size, 0, 128, 8, - NULL, 0, uct_module->rcache, NULL, NULL); - } - - uct_module->initialized = true; - } - - for (size_t i = 0 ; i < nprocs ; ++i) { - /* all endpoints are reachable for uct */ - peers[i] = mca_btl_uct_get_ep (btl, opal_procs[i]); - if (OPAL_UNLIKELY(NULL == peers[i])) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - opal_bitmap_set_bit(reachable, i); - } - - return OPAL_SUCCESS; -} - -static int mca_btl_uct_del_procs (mca_btl_base_module_t *btl, size_t nprocs, - opal_proc_t **procs, mca_btl_base_endpoint_t **peers) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; - mca_btl_base_endpoint_t *ep; - int rc; - - for (size_t i = 0 ; i < nprocs ; ++i) { - if (NULL == procs[i]) { - continue; - } - - rc = opal_hash_table_get_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) procs[i], (void **) &ep); - if (OPAL_SUCCESS != rc) { - continue; - } - - (void) opal_hash_table_remove_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) procs[i]); - OBJ_RELEASE(ep); - } - - return OPAL_SUCCESS; -} - - -/** - * @brief Register a memory region for put/get/atomic operations. - * - * @param btl (IN) BTL module - * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) - * @param base (IN) Pointer to start of region - * @param size (IN) Size of region - * @param flags (IN) Flags indicating what operation will be performed. Valid - * values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET, - * and MCA_BTL_DES_FLAGS_ATOMIC - * - * @returns a memory registration handle valid for both local and remote operations - * @returns NULL if the region could not be registered - * - * This function registers the specified region with the hardware for use with - * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop - * functions. Care should be taken to not hold an excessive number of registrations - * as they may use limited system/NIC resources. - */ -static struct mca_btl_base_registration_handle_t * -mca_btl_uct_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base, - size_t size, uint32_t flags) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; - mca_btl_uct_reg_t *reg; - int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; - int rc; - - rc = uct_module->rcache->rcache_register (uct_module->rcache, base, size, 0, access_flags, - (mca_rcache_base_registration_t **) ®); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return NULL; - } - - return ®->handle; -} - -/** - * @brief Deregister a memory region - * - * @param btl (IN) BTL module region was registered with - * @param handle (IN) BTL registration handle to deregister - * - * This function deregisters the memory region associated with the specified handle. Care - * should be taken to not perform any RDMA or atomic operation on this memory region - * after it is deregistered. It is erroneous to specify a memory handle associated with - * a remote node. - */ -static int mca_btl_uct_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; - mca_btl_uct_reg_t *reg = - (mca_btl_uct_reg_t *)((intptr_t) handle - offsetof (mca_btl_uct_reg_t, handle)); - - (void) uct_module->rcache->rcache_deregister (uct_module->rcache, ®->base); - - return OPAL_SUCCESS; -} - -int mca_btl_uct_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) reg_data; - mca_btl_uct_reg_t *uct_reg = (mca_btl_uct_reg_t *) reg; - ucs_status_t ucs_status; - int uct_flags = 0; - - BTL_VERBOSE(("attempting to register range {%p,%p} with uct", base, (char *) base + size)); - - if (MCA_BTL_REG_FLAG_REMOTE_READ & reg->access_flags) { - uct_flags |= UCT_MD_MEM_ACCESS_REMOTE_GET; - } - if (MCA_BTL_REG_FLAG_REMOTE_WRITE & reg->access_flags) { - uct_flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT; - } - if (MCA_BTL_REG_FLAG_REMOTE_ATOMIC & reg->access_flags) { - uct_flags |= UCT_MD_MEM_ACCESS_REMOTE_ATOMIC; - } - - /* UCT barfs if there are no access flags */ - if (0 == uct_flags) { - uct_flags = UCT_MD_MEM_ACCESS_ALL; - } - - ucs_status = uct_md_mem_reg (uct_module->md->uct_md, base, size, uct_flags, &uct_reg->uct_memh); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Error registering memory with UCT. code: %d", ucs_status)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (reg->access_flags & (MCA_BTL_REG_FLAG_REMOTE_READ | MCA_BTL_REG_FLAG_REMOTE_WRITE | MCA_BTL_REG_FLAG_REMOTE_ATOMIC)) { - /* requested registration may be used by a remote process so go ahead and pack - * the registration handle */ - ucs_status = uct_md_mkey_pack (uct_module->md->uct_md, uct_reg->uct_memh, uct_reg->handle.packed_handle); - if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { - BTL_VERBOSE(("Could not pack remote key. code: %d", ucs_status)); - uct_md_mem_dereg (uct_module->md->uct_md, uct_reg->uct_memh); - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - return OPAL_SUCCESS; -} - -int mca_btl_uct_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) reg_data; - mca_btl_uct_reg_t *uct_reg = (mca_btl_uct_reg_t *) reg; - - uct_md_mem_dereg (uct_module->md->uct_md, uct_reg->uct_memh); - - return OPAL_SUCCESS; -} - - -/* - * Cleanup/release module resources. - */ - -int mca_btl_uct_finalize (mca_btl_base_module_t* btl) -{ - mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; - mca_btl_uct_endpoint_t *endpoint; - uint64_t key; - - /* clean up any leftover endpoints */ - OPAL_HASH_TABLE_FOREACH(key, uint64, endpoint, &uct_module->id_to_endpoint) { - OBJ_RELEASE(endpoint); - } - OBJ_DESTRUCT(&uct_module->id_to_endpoint); - OBJ_DESTRUCT(&uct_module->short_frags); - OBJ_DESTRUCT(&uct_module->eager_frags); - OBJ_DESTRUCT(&uct_module->max_frags); - OBJ_DESTRUCT(&uct_module->pending_frags); - OBJ_DESTRUCT(&uct_module->lock); - OBJ_DESTRUCT(&uct_module->pending_connection_reqs); - - if (uct_module->rcache) { - mca_rcache_base_module_destroy (uct_module->rcache); - } - - if (NULL != uct_module->am_tl) { - OBJ_RELEASE(uct_module->am_tl); - } - - if (NULL != uct_module->conn_tl) { - OBJ_RELEASE(uct_module->conn_tl); - } - - if (NULL != uct_module->rdma_tl) { - OBJ_RELEASE(uct_module->rdma_tl); - } - - ucs_async_context_destroy (uct_module->ucs_async); - - OBJ_DESTRUCT(&uct_module->endpoint_lock); - - free (uct_module->md_name); - free (uct_module); - - return OPAL_SUCCESS; -} - -mca_btl_uct_module_t mca_btl_uct_module_template = { - .super = { - /* initialize functions. this btl only support RDMA and atomics - * for now so it does not provide prepare_src, alloc, free, or send */ - .btl_component = &mca_btl_uct_component.super, - .btl_add_procs = mca_btl_uct_add_procs, - .btl_del_procs = mca_btl_uct_del_procs, - .btl_finalize = mca_btl_uct_finalize, - .btl_put = mca_btl_uct_put, - .btl_get = mca_btl_uct_get, - .btl_register_mem = mca_btl_uct_register_mem, - .btl_deregister_mem = mca_btl_uct_deregister_mem, - .btl_atomic_op = mca_btl_uct_aop, - .btl_atomic_fop = mca_btl_uct_afop, - .btl_atomic_cswap = mca_btl_uct_acswap, - .btl_flush = mca_btl_uct_flush, - - .btl_sendi = mca_btl_uct_sendi, - .btl_prepare_src = mca_btl_uct_prepare_src, - .btl_send = mca_btl_uct_send, - .btl_alloc = mca_btl_uct_alloc, - .btl_free = mca_btl_uct_free, - - /* set the default flags for this btl. uct provides us with rdma and both - * fetching and non-fetching atomics (though limited to add and cswap) */ - .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS, - .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | - MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT, - - /* set the default limits on put and get */ - .btl_put_limit = 1 << 23, - .btl_put_alignment = 0, - .btl_get_limit = 1 << 23, - .btl_get_alignment = 0, - - .btl_rndv_eager_limit = 8192, - .btl_rdma_pipeline_frag_size = 4 * 1024 * 1024, - .btl_rdma_pipeline_send_length = 8192, - .btl_eager_limit = 8192, - .btl_max_send_size = 65536, - } -}; - -OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); - -static void mca_btl_uct_md_construct (mca_btl_uct_md_t *md) -{ - md->uct_md = NULL; -} - -static void mca_btl_uct_md_destruct (mca_btl_uct_md_t *md) -{ - if (md->uct_md) { - uct_md_close (md->uct_md); - md->uct_md = NULL; - } -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_object_t, mca_btl_uct_md_construct, mca_btl_uct_md_destruct); diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c deleted file mode 100644 index 2d2d1c3f04b..00000000000 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ /dev/null @@ -1,308 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_uct_device_context.h" - -void mca_btl_uct_uct_completion (uct_completion_t *uct_comp, ucs_status_t status) -{ - mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) ((uintptr_t) uct_comp - offsetof (mca_btl_uct_uct_completion_t, uct_comp)); - - BTL_VERBOSE(("network operation complete. status = %d", status)); - - comp->status = status; - opal_fifo_push (&comp->dev_context->completion_fifo, &comp->super.super); -} - - -static void mca_btl_uct_uct_completion_construct (mca_btl_uct_uct_completion_t *comp) -{ - comp->frag = NULL; - comp->uct_comp.func = mca_btl_uct_uct_completion; -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t, mca_btl_uct_uct_completion_construct, NULL); - - -mca_btl_uct_uct_completion_t * -mca_btl_uct_uct_completion_alloc (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_uct_device_context_t *dev_context, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) -{ - mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) opal_free_list_get (&dev_context->rdma_completions); - if (OPAL_LIKELY(NULL != comp)) { - comp->uct_comp.count = 1; - comp->btl = &uct_btl->super; - comp->endpoint = endpoint; - comp->local_address = local_address; - comp->local_handle = local_handle; - comp->cbfunc = cbfunc; - comp->cbcontext = cbcontext; - comp->cbdata = cbdata; - comp->dev_context = dev_context; - } - - return comp; -} - -void mca_btl_uct_uct_completion_release (mca_btl_uct_uct_completion_t *comp) -{ - if (comp) { - opal_free_list_return (&comp->dev_context->rdma_completions, &comp->super); - } -} - -static void mca_btl_uct_get_unpack (void *arg, const void *data, size_t length) -{ - memcpy (arg, data, length); -} - -int mca_btl_uct_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); - mca_btl_uct_uct_completion_t *comp = NULL; - ucs_status_t ucs_status; - uct_rkey_bundle_t rkey; - uct_ep_h ep_handle; - int rc; - - BTL_VERBOSE(("performing get operation. local address: %p, length: %lu", local_address, (unsigned long) size)); - - if (cbfunc) { - comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, - cbfunc, cbcontext, cbdata); - if (OPAL_UNLIKELY(NULL == comp)) { - BTL_VERBOSE(("culd not allocate completion structure")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_VERBOSE(("mca_btl_uct_get_rkey returned %d", rc)); - mca_btl_uct_uct_completion_release (comp); - return rc; - } - - mca_btl_uct_context_lock (context); - - if (size <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.get.max_bcopy) { - ucs_status = uct_ep_get_bcopy (ep_handle, mca_btl_uct_get_unpack, local_address, size, remote_address, - rkey.rkey, &comp->uct_comp); - } else { - uct_iov_t iov = {.buffer = local_address, .length = size, .stride = 0, .count = 1, - .memh = MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(local_handle)->uct_memh}; - ucs_status = uct_ep_get_zcopy (ep_handle, &iov, 1, remote_address, rkey.rkey, &comp->uct_comp); - } - - /* go ahead and progress the worker while we have the lock (if we are not in an AM callback) */ - if (!context->in_am_callback) { - (void) uct_worker_progress (context->uct_worker); - } - - mca_btl_uct_context_unlock (context); - - if (!context->in_am_callback) { - mca_btl_uct_device_handle_completions (context); - } - - if (UCS_OK == ucs_status && cbfunc) { - /* if UCS_OK is returned the callback will never fire so we have to make the callback - * ourselves */ - cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); - } - - if (UCS_INPROGRESS == ucs_status) { - ucs_status = UCS_OK; - } else { - mca_btl_uct_uct_completion_release (comp); - } - - BTL_VERBOSE(("get issued. status = %d", ucs_status)); - - uct_rkey_release (&rkey); - - return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY; -} - -struct mca_btl_uct_put_pack_args_t { - void *local_address; - size_t size; -}; - -typedef struct mca_btl_uct_put_pack_args_t mca_btl_uct_put_pack_args_t; - -static size_t mca_btl_uct_put_pack (void *dest, void *arg) -{ - mca_btl_uct_put_pack_args_t *args = (mca_btl_uct_put_pack_args_t *) arg; - - memcpy (dest, args->local_address, args->size); - return args->size; -} - -int mca_btl_uct_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); - mca_btl_uct_uct_completion_t *comp = NULL; - ucs_status_t ucs_status; - uct_rkey_bundle_t rkey; - uct_ep_h ep_handle; - bool use_short = false; - bool use_bcopy = false; - int rc; - - BTL_VERBOSE(("performing put operation. local address: %p, length: %lu", local_address, (unsigned long) size)); - - if (size > uct_btl->super.btl_put_local_registration_threshold && cbfunc) { - comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, - cbfunc, cbcontext, cbdata); - if (OPAL_UNLIKELY(NULL == comp)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_uct_completion_release (comp); - return rc; - } - - mca_btl_uct_context_lock (context); - - /* determine what UCT prototol should be used */ - if (size <= uct_btl->super.btl_put_local_registration_threshold) { - use_short = size <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.put.max_short; - use_bcopy = !use_short; - } - - do { - if (use_short) { - ucs_status = uct_ep_put_short (ep_handle, local_address, size, remote_address, rkey.rkey); - } else if (use_bcopy) { - ssize_t tmp = uct_ep_put_bcopy (ep_handle, mca_btl_uct_put_pack, - &(mca_btl_uct_put_pack_args_t) {.local_address = local_address, - .size = size}, - remote_address, rkey.rkey); - ucs_status = (tmp == (ssize_t) size) ? UCS_OK : UCS_ERR_NO_RESOURCE; - } else { - uct_iov_t iov = {.buffer = local_address, .length = size, .stride = 0, .count = 1, - .memh = MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(local_handle)->uct_memh}; - - ucs_status = uct_ep_put_zcopy (ep_handle, &iov, 1, remote_address, rkey.rkey, &comp->uct_comp); - } - - /* go ahead and progress the worker while we have the lock */ - if (UCS_ERR_NO_RESOURCE != ucs_status || context->in_am_callback) { - if (!context->in_am_callback) { - (void) uct_worker_progress (context->uct_worker); - } - - break; - } - - /* wait for something to complete */ - while (!uct_worker_progress (context->uct_worker)); - } while (1); - - mca_btl_uct_context_unlock (context); - - mca_btl_uct_device_handle_completions (context); - - if (UCS_OK == ucs_status && cbfunc) { - /* if UCS_OK is returned the callback will never fire so we have to make the callback - * ourselves. this callback is possibly being made before the data is visible to the - * remote process. */ - cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); - } - - if (UCS_INPROGRESS == ucs_status) { - ucs_status = UCS_OK; - } else { - mca_btl_uct_uct_completion_release (comp); - } - - uct_rkey_release (&rkey); - - return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY; -} - -int mca_btl_uct_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - const int tl_index = uct_btl->rdma_tl->tl_index; - const int context_count = mca_btl_uct_component.num_contexts_per_module; - ucs_status_t ucs_status; - - BTL_VERBOSE(("mca_btl_uct_flush starting")); - - for (int i = 0 ; i < context_count ; ++i) { - mca_btl_uct_device_context_t *context = uct_btl->rdma_tl->uct_dev_contexts[i]; - - if (NULL == context) { - continue; - } - - mca_btl_uct_context_lock (context); - /* this loop is here because at least some of the TLs do no support a - * completion callback. its a real PIA but has to be done for now. */ - do { - uct_worker_progress (context->uct_worker); - - if (NULL != endpoint && endpoint->uct_eps[context->context_id][tl_index].uct_ep) { - ucs_status = uct_ep_flush (endpoint->uct_eps[context->context_id][tl_index].uct_ep, 0, NULL); - } else { - ucs_status = uct_iface_flush (context->uct_iface, 0, NULL); - } - } while (UCS_INPROGRESS == ucs_status); - - mca_btl_uct_context_unlock (context); - mca_btl_uct_device_handle_completions (context); - } - - return OPAL_SUCCESS; -} - -int mca_btl_uct_flush_thread (mca_btl_base_module_t *btl) -{ - mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; - const int context_id = mca_btl_uct_get_context_index (); - mca_btl_uct_device_context_t *context = uct_btl->rdma_tl->uct_dev_contexts[context_id]; - ucs_status_t ucs_status; - - BTL_VERBOSE(("mca_btl_uct_flush_thread starting")); - - if (NULL == context) { - return OPAL_SUCCESS; - } - - mca_btl_uct_context_lock (context); - - /* this loop is here because at least some of the TLs do no support a - * completion callback. its a real PIA but has to be done for now. */ - do { - uct_worker_progress (context->uct_worker); - ucs_status = uct_iface_flush (context->uct_iface, 0, NULL); - } while (UCS_INPROGRESS == ucs_status); - - mca_btl_uct_context_unlock (context); - - mca_btl_uct_device_handle_completions (context); - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h deleted file mode 100644 index e9b0d6b19dc..00000000000 --- a/opal/mca/btl/uct/btl_uct_rdma.h +++ /dev/null @@ -1,62 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#if !defined(BTL_UCT_RDMA_H) -#define BTL_UCT_RDMA_H - -#include "btl_uct.h" -#include "btl_uct_endpoint.h" -#include "btl_uct_frag.h" - -/** - * @brief allocate a callback structure - */ -mca_btl_uct_uct_completion_t *mca_btl_uct_uct_completion_alloc (mca_btl_uct_module_t *btl, mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_uct_device_context_t *dev_context, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata); -/** - * @brief release a callback structure - */ -void mca_btl_uct_uct_completion_release (mca_btl_uct_uct_completion_t *comp); - -void mca_btl_uct_uct_completion (uct_completion_t *uct_comp, ucs_status_t status); - -/** - * @brief unpack the registration key and ensure the endpoint is connected - * - * @param[in] module uct btl module - * @param[in] context device context to use - * @param[in] endpoint btl endpoint - * @param[in] remote_handle buffer containing remote handle data - * @param[inout] rkey uct registration key bundle - * @param[out] ep_handle uct endpoint handle - */ -static inline int mca_btl_uct_get_rkey (mca_btl_uct_module_t *module, - mca_btl_uct_device_context_t *context, - mca_btl_base_endpoint_t *endpoint, - mca_btl_base_registration_handle_t *remote_handle, - uct_rkey_bundle_t *rkey, - uct_ep_h *ep_handle) -{ - ucs_status_t ucs_status; - int rc; - - rc = mca_btl_uct_endpoint_check_rdma (module, endpoint, context, ep_handle); - if (OPAL_SUCCESS != rc) { - return rc; - } - - ucs_status = uct_rkey_unpack ((void *) remote_handle, rkey); - return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR; -} - -#endif /* !defined(BTL_UCT_RDMA_H) */ diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c deleted file mode 100644 index be70af6ec8b..00000000000 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ /dev/null @@ -1,644 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2018 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Triad National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_uct_device_context.h" -#include "btl_uct_am.h" -#include "opal/util/bit_ops.h" -#include "opal/util/argv.h" - -#if HAVE_DECL_UCT_CB_FLAG_SYNC -#define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC -#else -#define MCA_BTL_UCT_CB_FLAG_SYNC 0 -#endif - -/** - * @brief Convert UCT capabilities to BTL flags - */ -static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = { - {UCT_IFACE_FLAG_AM_SHORT, MCA_BTL_FLAGS_SEND}, - {UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT}, - {UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET}, - {0,0}, -}; - -/** - * @brief Convert UCT capability flags to BTL flags - * - * @param[in] cap_flags UCT capability flags - * - * @returns equivalent BTL flags - */ -static int32_t mca_btl_uct_module_flags (uint64_t cap_flags) -{ - uint32_t flags = 0; - - for (int i = 0 ; mca_btl_uct_cap_to_btl_flag[i][0] > 0 ; ++i) { - if (cap_flags & mca_btl_uct_cap_to_btl_flag[i][0]) { - flags |= (uint32_t) mca_btl_uct_cap_to_btl_flag[i][1]; - } - } - return flags; -} - -#if OPAL_HAVE_UCT_EP_ATOMIC64_POST -/** - * @brief Convert UCT capabilities to BTL atomic flags - */ -static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { - {UCS_BIT(UCT_ATOMIC_OP_ADD), MCA_BTL_ATOMIC_SUPPORTS_ADD}, - {UCS_BIT(UCT_ATOMIC_OP_AND), MCA_BTL_ATOMIC_SUPPORTS_AND}, - {UCS_BIT(UCT_ATOMIC_OP_OR), MCA_BTL_ATOMIC_SUPPORTS_OR}, - {UCS_BIT(UCT_ATOMIC_OP_XOR), MCA_BTL_ATOMIC_SUPPORTS_XOR}, - {UCS_BIT(UCT_ATOMIC_OP_SWAP), MCA_BTL_ATOMIC_SUPPORTS_SWAP}, - {UCS_BIT(UCT_ATOMIC_OP_CSWAP), MCA_BTL_ATOMIC_SUPPORTS_CSWAP}, - {0, }, -}; - -static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) -{ - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; - - /* NTH: only use the fetching atomics for now */ - uint64_t atomic_flags32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags; - uint64_t atomic_flags64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags; - - /* NTH: don't really have a way to seperate 32-bit and 64-bit right now */ - uint64_t all_flags = atomic_flags32 & atomic_flags64; - - module->super.btl_atomic_flags = 0; - - if (cap_flags & UCT_IFACE_FLAG_ATOMIC_CPU) { - module->super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; - } - - for (int i = 0 ; mca_btl_uct_cap_to_btl_atomic_flag[i][0] ; ++i) { - if (all_flags & mca_btl_uct_cap_to_btl_atomic_flag[i][0]) { - module->super.btl_atomic_flags |= mca_btl_uct_cap_to_btl_atomic_flag[i][1]; - } - } - - if (0 != module->super.btl_atomic_flags) { - /* some atomics are supported */ - module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS; - } -} - -#else -/** - * @brief Convert UCT capabilities to BTL atomic flags - */ -static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { - {UCT_IFACE_FLAG_ATOMIC_ADD64, MCA_BTL_ATOMIC_SUPPORTS_ADD}, - {UCT_IFACE_FLAG_ATOMIC_ADD32, MCA_BTL_ATOMIC_SUPPORTS_32BIT}, - {UCT_IFACE_FLAG_ATOMIC_CSWAP64, MCA_BTL_ATOMIC_SUPPORTS_CSWAP}, - {UCT_IFACE_FLAG_ATOMIC_SWAP64, MCA_BTL_ATOMIC_SUPPORTS_SWAP}, - {UCT_IFACE_FLAG_ATOMIC_CPU, MCA_BTL_ATOMIC_SUPPORTS_GLOB}, - {0, }, -}; - -/** - * @brief Convert UCT capability flags to BTL atomic flags - * - * @param[in] cap_flags UCT capability flags - * - * @returns equivalent BTL atomic flags - */ -static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) -{ - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; - - module->super.btl_atomic_flags = 0; - - for (int i = 0 ; mca_btl_uct_cap_to_btl_atomic_flag[i][0] > 0 ; ++i) { - if (cap_flags & mca_btl_uct_cap_to_btl_atomic_flag[i][0]) { - module->super.btl_atomic_flags |= (uint32_t) mca_btl_uct_cap_to_btl_atomic_flag[i][1]; - } - } - - if (0 != module->super.btl_atomic_flags) { - /* some atomics are supported */ - module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS; - } -} - -#endif - -static void mca_btl_uct_tl_constructor (mca_btl_uct_tl_t *tl) -{ - memset ((void *)((uintptr_t) tl + sizeof (tl->super)), 0, sizeof (*tl) - sizeof (tl->super)); - OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t); -} - -static void mca_btl_uct_tl_destructor (mca_btl_uct_tl_t *tl) -{ - assert (((opal_object_t *) tl)->obj_reference_count == 0); - - for (int context_id = 0 ; context_id < MCA_BTL_UCT_MAX_WORKERS ; ++context_id) { - if (NULL != tl->uct_dev_contexts[context_id]) { - mca_btl_uct_context_destroy (tl->uct_dev_contexts[context_id]); - } - } - - if (tl->uct_md) { - OBJ_RELEASE(tl->uct_md); - } - - free (tl->uct_dev_contexts); - free (tl->uct_tl_name); - free (tl->uct_dev_name); - - if (NULL != tl->uct_tl_config) { - uct_config_release (tl->uct_tl_config); - } - - OBJ_DESTRUCT(&tl->tl_lock); -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, mca_btl_uct_tl_destructor); - -static ucs_status_t mca_btl_uct_conn_req_cb (void *arg, void *data, size_t length, unsigned flags) -{ - mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg; - mca_btl_uct_pending_connection_request_t *request = calloc (1, length + sizeof (request->super)); - - /* it is not safe to process the connection request from the callback so just save it for - * later processing */ - OBJ_CONSTRUCT(request, mca_btl_uct_pending_connection_request_t); - memcpy (&request->request_data, (void *) ((intptr_t) data + 8), length); - opal_fifo_push_atomic (&module->pending_connection_reqs, &request->super); - - return UCS_OK; -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_pending_connection_request_t, opal_list_item_t, NULL, NULL); - -int mca_btl_uct_process_connection_request (mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req) -{ - struct opal_proc_t *remote_proc = opal_proc_for_name (req->proc_name); - mca_btl_base_endpoint_t *endpoint = mca_btl_uct_get_ep (&module->super, remote_proc); - mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[req->context_id] + req->tl_index; - int32_t ep_flags; - int rc; - - BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d", - (void *) endpoint, req->type, req->context_id)); - - if (NULL == endpoint) { - BTL_ERROR(("could not create endpoint for connection request")); - return UCS_ERR_UNREACHABLE; - } - - assert (req->type < 2); - - ep_flags = opal_atomic_fetch_or_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC); - - if (!(ep_flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) { - /* create any necessary resources */ - rc = mca_btl_uct_endpoint_connect (module, endpoint, req->context_id, req->ep_addr, req->tl_index); - if (OPAL_SUCCESS != rc && OPAL_ERR_OUT_OF_RESOURCE != rc) { - BTL_ERROR(("could not setup rdma endpoint. rc = %d", rc)); - return rc; - } - } - - /* the connection is ready once we have received the connection data and also a connection ready - * message. this might be overkill but there is little documentation at the UCT level on when - * an endpoint can be used. */ - if (req->type == 1) { - /* remote side is ready */ - mca_btl_uct_base_frag_t *frag; - - /* to avoid a race with send adding pending frags grab the lock here */ - OPAL_THREAD_SCOPED_LOCK(&endpoint->ep_lock,{ - BTL_VERBOSE(("connection ready. sending %" PRIsize_t " frags", opal_list_get_size (&module->pending_frags))); - (void) opal_atomic_or_fetch_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY); - opal_atomic_wmb (); - - OPAL_LIST_FOREACH(frag, &module->pending_frags, mca_btl_uct_base_frag_t) { - if (frag->context->context_id == req->context_id && endpoint == frag->endpoint) { - frag->ready = true; - } - } - }); - } - - return OPAL_SUCCESS; -} - -static int mca_btl_uct_setup_connection_tl (mca_btl_uct_module_t *module) -{ - ucs_status_t ucs_status; - - if (NULL == module->conn_tl) { - return OPAL_ERR_NOT_SUPPORTED; - } - - ucs_status = uct_iface_set_am_handler (module->conn_tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_CONNECT_RDMA, - mca_btl_uct_conn_req_cb, module, UCT_CB_FLAG_ASYNC); - if (UCS_OK != ucs_status) { - BTL_ERROR(("could not set active message handler for uct tl")); - } - - return UCS_OK == ucs_status ? OPAL_SUCCESS : OPAL_ERROR; -} - -static void mca_btl_uct_context_enable_progress (mca_btl_uct_device_context_t *context) -{ - if (!context->progress_enabled) { -#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE - uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | - UCT_PROGRESS_RECV); -#else - uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#endif - context->progress_enabled = true; - } -} - -mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress) -{ - uct_iface_params_t iface_params = {.rndv_cb = NULL, .eager_cb = NULL, .stats_root = NULL, - .rx_headroom = 0, .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, - .mode = {.device = {.tl_name = tl->uct_tl_name, - .dev_name = tl->uct_dev_name}}}; - mca_btl_uct_device_context_t *context; - ucs_status_t ucs_status; - int rc; - - context = calloc (1, sizeof (*context)); - if (OPAL_UNLIKELY(NULL == context)) { - return NULL; - } - - context->context_id = context_id; - context->uct_btl = module; - OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); - OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t); - - rc = opal_free_list_init (&context->rdma_completions, sizeof (mca_btl_uct_uct_completion_t), - opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), - 0, opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, - NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_context_destroy (context); - return NULL; - } - - /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to - * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their - * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the - * various UCT calls. */ - ucs_status = uct_worker_create (module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); - if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { - BTL_VERBOSE(("could not create a UCT worker")); - mca_btl_uct_context_destroy (context); - return NULL; - } - - ucs_status = uct_iface_open (tl->uct_md->uct_md, context->uct_worker, &iface_params, - tl->uct_tl_config, &context->uct_iface); - if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { - BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status)); - mca_btl_uct_context_destroy (context); - return NULL; - } - - /* only need to query one of the interfaces to get the attributes */ - ucs_status = uct_iface_query (context->uct_iface, &context->uct_iface_attr); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Error querying UCT interface")); - mca_btl_uct_context_destroy (context); - return NULL; - } - - if (context_id > 0 && tl == module->am_tl) { - BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id)); - uct_iface_set_am_handler (context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, - context, MCA_BTL_UCT_CB_FLAG_SYNC); - } - - if (enable_progress) { - BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id)); - mca_btl_uct_context_enable_progress (context); - } - - return context; -} - -void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context) -{ - if (context->uct_iface) { - uct_iface_close (context->uct_iface); - context->uct_iface = NULL; - } - - if (context->uct_worker) { - uct_worker_destroy (context->uct_worker); - context->uct_worker = NULL; - } - - OBJ_DESTRUCT(&context->completion_fifo); - OBJ_DESTRUCT(&context->rdma_completions); - free (context); -} - -static int tl_compare (opal_list_item_t **a, opal_list_item_t **b) -{ - mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; - mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; - - return tl_a->priority - tl_b->priority; -} - -static mca_btl_uct_tl_t *mca_btl_uct_create_tl (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority) -{ - mca_btl_uct_tl_t *tl = OBJ_NEW(mca_btl_uct_tl_t); - - if (OPAL_UNLIKELY(NULL == tl)) { - return NULL; - } - - /* initialize btl tl structure */ - tl->uct_md = md; - OBJ_RETAIN(md); - - tl->uct_tl_name = strdup (tl_desc->tl_name); - tl->uct_dev_name = strdup (tl_desc->dev_name); - tl->priority = priority; - - tl->uct_dev_contexts = calloc (MCA_BTL_UCT_MAX_WORKERS, sizeof (tl->uct_dev_contexts[0])); - if (NULL == tl->uct_dev_contexts) { - OBJ_RELEASE(tl); - return NULL; - } - - (void) uct_md_iface_config_read (md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); - - /* always create a 0 context (needed to query) */ - tl->uct_dev_contexts[0] = mca_btl_uct_context_create (module, tl, 0, false); - if (NULL == tl->uct_dev_contexts[0]) { - BTL_VERBOSE(("could not create a uct device context")); - OBJ_RELEASE(tl); - return NULL; - } - - BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name, - (unsigned long) MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); - - return tl; -} - -static void mca_btl_uct_set_tl_rdma (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) -{ - BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name)); - - mca_btl_uct_module_set_atomic_flags (module, tl); - - module->super.btl_get_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_zcopy; - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy) { - module->super.btl_get_alignment = 0; - module->super.btl_get_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy; - } else { - /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */ - module->super.btl_get_alignment = opal_next_poweroftwo_inclusive (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.min_zcopy); - } - - module->super.btl_put_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_zcopy; - module->super.btl_put_alignment = 0; - - /* no registration needed when using short/bcopy put */ - module->super.btl_put_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_bcopy; - - module->rdma_tl = tl; - OBJ_RETAIN(tl); - - tl->tl_index = (module->am_tl && tl != module->am_tl) ? 1 : 0; - module->comm_tls[tl->tl_index] = tl; - if (tl->max_device_contexts <= 1) { - tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; - } -} - -static void mca_btl_uct_set_tl_am (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) -{ - BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name)); - - if (module->rdma_tl == tl) { - module->shared_endpoints = true; - } - module->am_tl = tl; - OBJ_RETAIN(tl); - - uct_iface_set_am_handler (tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_FRAG, - mca_btl_uct_am_handler, tl->uct_dev_contexts[0], UCT_CB_FLAG_ASYNC); - - tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 1 : 0; - module->comm_tls[tl->tl_index] = tl; - if (tl->max_device_contexts <= 1) { - tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; - } - - module->super.btl_max_send_size = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_zcopy - sizeof (mca_btl_uct_am_header_t); - module->super.btl_eager_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_bcopy - sizeof (mca_btl_uct_am_header_t); -} - -static int mca_btl_uct_set_tl_conn (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) -{ - int rc; - - BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name)); - - module->conn_tl = tl; - rc = mca_btl_uct_setup_connection_tl (module); - if (OPAL_SUCCESS != rc) { - return rc; - } - - OBJ_RETAIN(tl); - - if (!tl->max_device_contexts) { - /* if a tl is only being used to create connections do not bother with multiple - * contexts */ - tl->max_device_contexts = 1; - } - - return OPAL_SUCCESS; -} - -static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) -{ - int rc; - - BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name)); - if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma (tl)) { - mca_btl_uct_set_tl_rdma (module, tl); - } - - if (NULL == module->am_tl && mca_btl_uct_tl_support_am (tl)) { - mca_btl_uct_set_tl_am (module, tl); - } - - if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn (tl)) { - rc = mca_btl_uct_set_tl_conn (module, tl); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - - if (tl == module->rdma_tl || tl == module->am_tl) { - BTL_VERBOSE(("tl has flags 0x%" PRIx64, MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); - module->super.btl_flags |= mca_btl_uct_module_flags (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags); - - /* the bandwidth and latency numbers relate to both rdma and active messages. need to - * come up with a better estimate. */ - - /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ - module->super.btl_bandwidth = (uint32_t) (MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0); - /* TODO -- figure out how to translate UCT latency to us */ - module->super.btl_latency = 1; - } - - if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) { - /* make sure progress is enabled on the default context now that we know this TL will be used */ - mca_btl_uct_context_enable_progress (tl->uct_dev_contexts[0]); - } - - return OPAL_SUCCESS; -} - -int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count) -{ - bool include = true, any = false; - mca_btl_uct_tl_t *tl; - opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; - - OBJ_CONSTRUCT(&tl_list, opal_list_t); - - tl_filter = opal_argv_split (mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup (tl_filter[0] + 1); - - free (tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } - - /* check for the any keyword */ - for (unsigned j = 0 ; tl_filter[j] ; ++j) { - if (0 == strcmp (tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } - - if (any && !include) { - opal_argv_free (tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } - - for (unsigned i = 0 ; i < tl_count ; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0 ; tl_filter[j] ; ++j) { - if (0 == strcmp (tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - priority = j; - break; - } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, try_tl, priority)); - - if (!try_tl) { - continue; - } - - if (0 == strcmp (tl_descs[i].tl_name, "ud")) { - /* ud looks like any normal transport but we do not want to use it for anything other - * than connection management so ensure it gets evaluated last */ - priority = INT_MAX; - } - - tl = mca_btl_uct_create_tl (module, md, tl_descs + i, priority); - - if (tl) { - opal_list_append (&tl_list, &tl->super); - } - } - - opal_argv_free (tl_filter); - - if (0 == opal_list_get_size (&tl_list)) { - BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); - OBJ_DESTRUCT(&tl_list); - return OPAL_ERR_NOT_AVAILABLE; - } - - opal_list_sort (&tl_list, tl_compare); - - OPAL_LIST_FOREACH(tl, &tl_list, mca_btl_uct_tl_t) { - mca_btl_uct_evaluate_tl (module, tl); - if (NULL != module->am_tl && NULL != module->rdma_tl && - (NULL != module->conn_tl || !(mca_btl_uct_tl_requires_connection_tl (module->am_tl) || - mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)))) { - /* all done */ - break; - } - } - - if (NULL == module->rdma_tl) { - /* no rdma tls */ - BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); - - module->super.btl_put = NULL; - module->super.btl_get = NULL; - module->super.btl_atomic_fop = NULL; - module->super.btl_atomic_op = NULL; - } - - if (NULL == module->am_tl) { - /* no active message tls == no send/recv */ - BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support")); - - module->super.btl_send = NULL; - module->super.btl_sendi = NULL; - module->super.btl_alloc = NULL; - module->super.btl_free = NULL; - } - - OPAL_LIST_DESTRUCT(&tl_list); - - if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl (module->am_tl)) && - !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)) && - module->conn_tl) { - /* no connection tl needed for selected transports */ - OBJ_RELEASE(module->conn_tl); - module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h deleted file mode 100644 index 7b7a4eaa69b..00000000000 --- a/opal/mca/btl/uct/btl_uct_types.h +++ /dev/null @@ -1,343 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#if !defined(BTL_UCT_TYPES_H) -#define BTL_UCT_TYPES_H - -#include "opal/mca/btl/btl.h" - -/* forward declarations */ -struct mca_btl_uct_module_t; -struct mca_btl_base_endpoint_t; -struct mca_btl_uct_base_frag_t; - -/* TL endpoint flags */ -/** connection data was received */ -#define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC 0x1 -/** remote endpoint read */ -#define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY 0x2 -/** connection was established */ -#define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY 0x4 - -/* AM tags */ -/** BTL fragment */ -#define MCA_BTL_UCT_FRAG 0x0d -/** connection request */ -#define MCA_BTL_UCT_CONNECT_RDMA 0x0e - -/** maximum number of modules supported by the btl component */ -#define MCA_BTL_UCT_MAX_MODULES 16 -/** maximum number of UCT workers */ -#define MCA_BTL_UCT_MAX_WORKERS 64 - -/** - * @brief MODEx data - */ -struct mca_btl_uct_modex_t { - /** number of modules whose data is stored in this modex */ - int32_t module_count; - - /** variable length modex data */ - uint8_t data[]; -}; - -typedef struct mca_btl_uct_modex_t mca_btl_uct_modex_t; - -/** - * @brief BTL UCT memory domain structure - * - * Each BTL module supports a single memory domain. Each memory domain has - * one or more transport layers. - */ -struct mca_btl_uct_md_t { - /** make this an opal object */ - opal_object_t super; - - /** UCT memory domain handle */ - uct_md_h uct_md; -}; - -typedef struct mca_btl_uct_md_t mca_btl_uct_md_t; - -OBJ_CLASS_DECLARATION(mca_btl_uct_md_t); - - -/** - * @brief Connection request structure - */ -struct mca_btl_uct_conn_req_t { - /** name of the requesting process */ - opal_process_name_t proc_name; - - /** request type: 0 == endpoint data, 1 == endpoint data + remote ready */ - int type; - - /** context id that should be connected */ - int context_id; - - /** transport index that should be connected */ - int tl_index; - - /** endpoint address data */ - uint8_t ep_addr[]; -}; - -typedef struct mca_btl_uct_conn_req_t mca_btl_uct_conn_req_t; - -/** - * @brief Transport endpoint stucture - */ -struct mca_btl_uct_tl_endpoint_t { - /** current flags (connected, requested, etc) */ - volatile int32_t flags; - - /** UCT endpoint handle */ - uct_ep_h uct_ep; -}; - -typedef struct mca_btl_uct_tl_endpoint_t mca_btl_uct_tl_endpoint_t; - -/** - * @brief Structure to keep track of connection endpoints - */ -struct mca_btl_uct_connection_ep_t { - /** opal base object */ - opal_object_t super; - - /** UCT endpoint used for connection */ - uct_ep_h uct_ep; -}; - -typedef struct mca_btl_uct_connection_ep_t mca_btl_uct_connection_ep_t; - -OBJ_CLASS_DECLARATION(mca_btl_uct_connection_ep_t); - -/** - * @brief Context for UCT device interface - * - * This structure uses atomic locks to protect the UCT worker (which is not thread safe). - * In order to make device access fast pthread mutexes are not used. To deal with recursion - * (unavoidable with active messages) we implement an atomic lock using C11 atomics (or - * pthread thread-specific values with older compilers). - */ -struct mca_btl_uct_device_context_t { - /** index of this context */ - int context_id; - - /** btl module this context is associated with */ - struct mca_btl_uct_module_t *uct_btl; - - /** mutex for protecting the UCT worker */ - opal_recursive_mutex_t mutex; - - /** UCT worker handle */ - uct_worker_h uct_worker; - - /** UCT interface handle */ - uct_iface_h uct_iface; - - /** interface attributes */ - uct_iface_attr_t uct_iface_attr; - - /** RDMA completions */ - opal_free_list_t rdma_completions; - - /** complete fragments and rdma operations. this fifo is used to avoid making - * callbacks while holding the device lock. */ - opal_fifo_t completion_fifo; - - /** progress is enabled on this context */ - bool progress_enabled; - - /** context is in AM callback */ - volatile bool in_am_callback; -}; - -typedef struct mca_btl_uct_device_context_t mca_btl_uct_device_context_t; - -/** - * @brief Header for all BTL UCT active messages - */ -union mca_btl_uct_am_header_t { - /** active message header data */ - struct mca_btl_uct_am_header_data_t { - /** callback tag */ - mca_btl_base_tag_t tag; - - /** padding */ - uint8_t padding[7]; - } data; - - /** header value. this is 64-bits to support using this with uct_ep_am_short */ - uint64_t value; -}; - -typedef union mca_btl_uct_am_header_t mca_btl_uct_am_header_t; - -/** - * @brief structure to keep track of btl callback - * - * This structuere is passed to various uct functions. It - * does the translation between the uct callback and the - * btl callback. - */ -struct mca_btl_uct_uct_completion_t { - /** allocated from a free list */ - opal_free_list_item_t super; - - /** uct completion structure */ - uct_completion_t uct_comp; - - /** AM completion context */ - struct mca_btl_uct_base_frag_t *frag; - - /** btl module associated with the callback */ - struct mca_btl_base_module_t *btl; - - /** btl endpoint associated with the callback */ - struct mca_btl_base_endpoint_t *endpoint; - - /** local address */ - void *local_address; - - /** local registration handle */ - mca_btl_base_registration_handle_t *local_handle; - - /** user callback function */ - mca_btl_base_rdma_completion_fn_t cbfunc; - - /** user callback context */ - void *cbcontext; - - /** user callback data */ - void *cbdata; - - /** device context */ - mca_btl_uct_device_context_t *dev_context; - - /** status */ - int status; -}; - -typedef struct mca_btl_uct_uct_completion_t mca_btl_uct_uct_completion_t; - -OBJ_CLASS_DECLARATION(mca_btl_uct_uct_completion_t); - -/** - * @brief Base fragment structure - */ -struct mca_btl_uct_base_frag_t { - /** btl base fragment */ - mca_btl_base_descriptor_t base; - - /** segments (used with the base fragment) */ - mca_btl_base_segment_t segments[2]; - - /** module this fragment is associated with */ - struct mca_btl_uct_module_t *btl; - - /* tl context */ - mca_btl_uct_device_context_t *context; - - /** is this frag ready to send (only used when pending) */ - bool ready; - - /** endpoint this fragment is associated with */ - struct mca_btl_base_endpoint_t *endpoint; - - /** free list this fragment was allocated from */ - opal_free_list_t *free_list; - - /** fragment btl/uct header */ - mca_btl_uct_am_header_t header; - - /** pre-filled UCT io vector */ - uct_iov_t uct_iov; - - /** completion structure */ - mca_btl_uct_uct_completion_t comp; -}; - -typedef struct mca_btl_uct_base_frag_t mca_btl_uct_base_frag_t; - -OBJ_CLASS_DECLARATION(mca_btl_uct_base_frag_t); - -struct mca_btl_base_endpoint_t { - /** opal base class */ - opal_object_t super; - - /** endpoint proc */ - opal_proc_t *ep_proc; - - /** mutex to protect this structure */ - opal_recursive_mutex_t ep_lock; - - /** cached connection endpoint */ - mca_btl_uct_connection_ep_t *conn_ep; - - /** endpoints into UCT for this BTL endpoint */ - mca_btl_uct_tl_endpoint_t uct_eps[][2]; -}; - -typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; -typedef mca_btl_base_endpoint_t mca_btl_uct_endpoint_t; -OBJ_CLASS_DECLARATION(mca_btl_uct_endpoint_t); - -/** - * @brief BTL UCT abstraction of a UCT transport layer - */ -struct mca_btl_uct_tl_t { - /** make this an opal object */ - opal_list_item_t super; - - /** relative priority 0 == highest */ - int priority; - - /** memory domain associated with this tl */ - mca_btl_uct_md_t *uct_md; - - /** lock protecting tl structures */ - opal_mutex_t tl_lock; - - /** tl configuration (used for creating device contexts) */ - uct_iface_config_t *uct_tl_config; - - /** name of this tl (used for creating device contexts) */ - char *uct_tl_name; - - /** device name for this tl (used for creating device contexts) */ - char *uct_dev_name; - - /** maxiumum number of device contexts that can be created */ - int max_device_contexts; - - /** array of device contexts */ - mca_btl_uct_device_context_t **uct_dev_contexts; - - /** tl index. this is used to differentiate (if there is any difference) - * between rdma and am endpoints */ - int tl_index; -}; - -typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t; -OBJ_CLASS_DECLARATION(mca_btl_uct_tl_t); - -#define MCA_BTL_UCT_TL_ATTR(tl, context_id) (tl)->uct_dev_contexts[(context_id)]->uct_iface_attr - -struct mca_btl_uct_pending_connection_request_t { - opal_list_item_t super; - uint8_t request_data[]; -}; - -typedef struct mca_btl_uct_pending_connection_request_t mca_btl_uct_pending_connection_request_t; -OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); - -#endif /* !defined(BTL_UCT_TYPES_H) */ diff --git a/opal/mca/btl/uct/configure.m4 b/opal/mca/btl/uct/configure.m4 deleted file mode 100644 index 82844857740..00000000000 --- a/opal/mca/btl/uct/configure.m4 +++ /dev/null @@ -1,60 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2006 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 QLogic Corp. All rights reserved. -# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2018 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2018 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# OPAL_CHECK_UCX(prefix, [action-if-found], [action-if-not-found]) -# -------------------------------------------------------- -# check if UCX support can be found. sets prefix_{CPPFLAGS, -# LDFLAGS, LIBS} as needed and runs action-if-found if there is -# support, otherwise executes action-if-not-found - -AC_DEFUN([MCA_opal_btl_uct_CONFIG],[ - AC_CONFIG_FILES([opal/mca/btl/uct/Makefile]) - - OMPI_CHECK_UCX([btl_uct], - [btl_uct_happy="yes"], - [btl_uct_happy="no"]) - if test "$btl_uct_happy" = "yes" ; then - OPAL_VAR_SCOPE_PUSH([CPPFLAGS_save]) - - CPPFLAGS_save="$CPPFLAGS" - CPPFLAGS="$CPPFLAGS $btl_uct_CPPFLAGS" - - AC_CHECK_DECLS([UCT_PROGRESS_THREAD_SAFE, UCT_CB_FLAG_SYNC], [], [], [[#include ]]) - - CPPFLAGS="$CPPFLAGS_save" - OPAL_VAR_SCOPE_POP - fi - - AS_IF([test "$btl_uct_happy" = "yes"], - [$1 - btl_uct_LIBS="$btl_uct_LIBS -luct" - ], - [$2]) - - # substitute in the things needed to build ucx - AC_SUBST([btl_uct_CPPFLAGS]) - AC_SUBST([btl_uct_LDFLAGS]) - AC_SUBST([btl_uct_LIBS]) -])dnl diff --git a/opal/mca/btl/uct/owner.txt b/opal/mca/btl/uct/owner.txt deleted file mode 100644 index 4918816bc9e..00000000000 --- a/opal/mca/btl/uct/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner:LANL -status:active