Skip to content

Commit 039e1b5

Browse files
committed
opal/mca/ofi: select NIC closest to accelerator if requested
When an accelerator is requested, select the NIC closest to the accelerator device. If the accelerator or NIC PCI information is not available, fall back to selecting the NIC on the closest package. Signed-off-by: Wenduo Wang <[email protected]>
1 parent 8eaa49a commit 039e1b5

File tree

1 file changed

+188
-3
lines changed

1 file changed

+188
-3
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 188 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "common_ofi.h"
3232
#include "opal/constants.h"
33+
#include "opal/mca/accelerator/accelerator.h"
3334
#include "opal/mca/base/mca_base_framework.h"
3435
#include "opal/mca/base/mca_base_var.h"
3536
#include "opal/mca/hwloc/base/base.h"
@@ -38,6 +39,7 @@
3839
#include "opal/util/argv.h"
3940
#include "opal/util/show_help.h"
4041

42+
/* External declaration of the accelerator module object; its definition is not in this view. */
extern opal_accelerator_base_module_t opal_accelerator;
4143
/* Shared OFI common-module state: both provider filters start out NULL and
 * the output handle (passed to opal_output_verbose() below) starts at -1. */
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4244
.prov_exclude = NULL,
4345
.output = -1};
@@ -778,6 +780,168 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
778780
return (uint32_t) process_info->myprocid.rank;
779781
}
780782

783+
/**
 * @brief Compute a non-negative depth for an hwloc object.
 *
 * If the object's own depth is non-negative it is returned as is. Otherwise
 * the parent chain is walked upwards until an ancestor with a non-negative
 * depth is found, and the result is that ancestor's depth plus the number of
 * levels climbed.
 *
 * @param[in]  obj    object to inspect
 * @param[out] depth  receives the computed depth on success
 * @return OPAL_SUCCESS if a depth was computed
 *         OPAL_ERROR if an argument is NULL or no ancestor has a non-negative depth
 */
static int get_obj_depth(hwloc_obj_t obj, int *depth)
{
    hwloc_obj_t ancestor = NULL;
    int levels_up = 0;

    if (NULL == obj || NULL == depth) {
        return OPAL_ERROR;
    }

    /* For hwloc < 2.0, depth is unsigned type, but it could store a negative value */
    if (0 <= (int) obj->depth) {
        *depth = (int) obj->depth;
        return OPAL_SUCCESS;
    }

    /* Climb one level per iteration. The cursor must advance from the current
     * ancestor (not from obj), otherwise the walk never terminates when the
     * immediate parent also has a negative depth. */
    for (ancestor = obj->parent; NULL != ancestor; ancestor = ancestor->parent) {
        ++levels_up;
        if (0 <= (int) ancestor->depth) {
            *depth = (int) ancestor->depth + levels_up;
            return OPAL_SUCCESS;
        }
    }

    return OPAL_ERROR;
}
806+
807+
#if OPAL_OFI_PCI_DATA_AVAILABLE
808+
/**
809+
* @brief Attempt to find a nearest provider from the accelerator.
810+
* Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
811+
* shortest distance.
812+
* Special cases:
813+
* 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
814+
* 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
815+
* return OPAL_ERR_NOT_AVAILABLE.
816+
* 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
817+
* i.e. (local rank on the same accelerator) % (number of nearest providers)
818+
* @param[in] provider_list linked list of providers
819+
* @param[in] num_providers number of providers
820+
* @param[in] device_rank local rank on the accelerator
821+
* @param[out] provider pointer to the selected provider
822+
* @return OPAL_SUCCESS if a provider is successfully selected
823+
* OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
824+
* OPAL_ERROR if a fatal error happened
825+
*/
826+
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
827+
size_t num_providers, uint32_t device_rank,
828+
struct fi_info **provider)
829+
{
830+
hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
831+
int ret = -1, accl_id = -1, depth = -1, max_common_ancestor_depth = -1;
832+
opal_accelerator_pci_attr_t accl_pci_attr = {0};
833+
struct fi_info *current_provider = NULL;
834+
struct fi_pci_attr pci = {0};
835+
uint32_t near_provider_count = 0, provider_rank = 0;
836+
uint32_t distances[num_providers], *distance = distances;
837+
838+
memset(distances, 0, sizeof(distances));
839+
840+
ret = opal_accelerator.get_device(&accl_id);
841+
if (OPAL_SUCCESS != ret) {
842+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
843+
__FILE__, __LINE__);
844+
return OPAL_ERR_NOT_AVAILABLE;
845+
}
846+
847+
ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
848+
if (OPAL_SUCCESS != ret) {
849+
opal_output_verbose(1, opal_common_ofi.output,
850+
"%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
851+
return OPAL_ERROR;
852+
}
853+
854+
accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
855+
accl_pci_attr.bus_id, accl_pci_attr.device_id,
856+
accl_pci_attr.function_id);
857+
if (NULL == accl_dev) {
858+
opal_output_verbose(1, opal_common_ofi.output,
859+
"%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
860+
return OPAL_ERROR;
861+
}
862+
863+
opal_output_verbose(1, opal_common_ofi.output,
864+
"%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
865+
__FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
866+
accl_pci_attr.device_id, accl_pci_attr.function_id,
867+
accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);
868+
869+
current_provider = provider_list;
870+
while (NULL != current_provider) {
871+
common_ancestor = NULL;
872+
if (0 == check_provider_attr(provider_list, current_provider)
873+
&& (NULL != current_provider->nic && NULL != current_provider->nic->bus_attr
874+
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI)) {
875+
pci = current_provider->nic->bus_attr->attr.pci;
876+
877+
prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
878+
pci.device_id, pci.function_id);
879+
if (NULL == prov_dev) {
880+
opal_output_verbose(1, opal_common_ofi.output,
881+
"%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
882+
return OPAL_ERROR;
883+
}
884+
885+
common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
886+
prov_dev);
887+
if (!common_ancestor) {
888+
opal_output_verbose(
889+
1, opal_common_ofi.output,
890+
"%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
891+
__FILE__, __LINE__);
892+
/**
893+
* Return error because any 2 PCI devices should share at least one common ancestor,
894+
* i.e. root
895+
*/
896+
return OPAL_ERROR;
897+
}
898+
899+
ret = get_obj_depth(common_ancestor, &depth);
900+
if (OPAL_SUCCESS != ret) {
901+
opal_output_verbose(1, opal_common_ofi.output,
902+
"%s:%d:Failed to get common ancestor depth", __FILE__,
903+
__LINE__);
904+
return OPAL_ERROR;
905+
}
906+
907+
if (max_common_ancestor_depth < depth) {
908+
max_common_ancestor_depth = depth;
909+
near_provider_count = 1;
910+
} else if (max_common_ancestor_depth == depth) {
911+
++near_provider_count;
912+
}
913+
}
914+
915+
*(distance++) = !common_ancestor ? 0 : depth;
916+
current_provider = current_provider->next;
917+
}
918+
919+
if (0 == near_provider_count || 0 > max_common_ancestor_depth) {
920+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
921+
__FILE__, __LINE__);
922+
return OPAL_ERR_NOT_AVAILABLE;
923+
}
924+
925+
provider_rank = device_rank % near_provider_count;
926+
927+
distance = distances;
928+
current_provider = provider_list;
929+
while (NULL != current_provider) {
930+
if (max_common_ancestor_depth == *(distance++) && provider_rank == --near_provider_count) {
931+
*provider = current_provider;
932+
return OPAL_SUCCESS;
933+
}
934+
935+
current_provider = current_provider->next;
936+
}
937+
938+
assert(0 == near_provider_count);
939+
940+
return OPAL_ERROR;
941+
}
942+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
943+
944+
781945
struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
782946
opal_process_info_t *process_info)
783947
{
@@ -793,6 +957,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793957
int ret;
794958
unsigned int num_provider = 0, provider_limit = 0;
795959
bool provider_found = false;
960+
uint32_t package_rank;
796961

797962
/* Initialize opal_hwloc_topology if it is not already */
798963
ret = opal_hwloc_base_get_topology();
@@ -802,8 +967,29 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
802967
__FILE__, __LINE__);
803968
}
804969

970+
/* Current process' local rank on the same package(socket) */
971+
package_rank = get_package_rank(process_info);
805972
provider_limit = count_providers(provider_list);
806973

974+
#if OPAL_OFI_PCI_DATA_AVAILABLE
975+
/**
976+
* If accelerator is enabled, select the closest provider to the accelerator.
977+
* Note: the function expects a local rank on the accelerator to break ties if there are
978+
* multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
979+
*/
980+
ret = find_nearest_provider_from_accelerator(provider_list, provider_limit, package_rank,
981+
&provider);
982+
if (!ret)
983+
return provider;
984+
985+
if (OPAL_ERR_NOT_AVAILABLE != ret) {
986+
opal_output_verbose(1, opal_common_ofi.output,
987+
"%s:%d:Failed to find a provider close to the accelerator. Error: %d",
988+
__FILE__, __LINE__, ret);
989+
return provider_list;
990+
}
991+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
992+
807993
/* Allocate memory for provider table */
808994
provider_table = calloc(provider_limit, sizeof(struct fi_info *));
809995
if (NULL == provider_table) {
@@ -820,11 +1006,11 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
8201006
distances = get_nearest_nics(&num_distances, &pmix_val);
8211007
#endif
8221008

823-
current_provider = provider;
1009+
current_provider = provider_list;
8241010

8251011
/* Cycle through remaining fi_info objects, looking for alike providers */
8261012
while (NULL != current_provider) {
827-
if (!check_provider_attr(provider, current_provider)) {
1013+
if (!check_provider_attr(provider_list, current_provider)) {
8281014
near = false;
8291015
#if OPAL_OFI_PCI_DATA_AVAILABLE
8301016
if (NULL != current_provider->nic
@@ -853,7 +1039,6 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
8531039
}
8541040

8551041
/* Select provider from local rank % number of providers */
856-
uint32_t package_rank = get_package_rank(process_info);
8571042
if (num_provider >= 2) {
8581043
// If there are multiple NICs "close" to the process, try to calculate package_rank
8591044
provider = provider_table[package_rank % num_provider];

0 commit comments

Comments
 (0)