diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c
index d9888f23f22..985e2fd51e4 100644
--- a/opal/mca/common/ofi/common_ofi.c
+++ b/opal/mca/common/ofi/common_ofi.c
@@ -487,12 +487,24 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
         && !check_ep_attr(provider_info->ep_attr, provider->ep_attr)
         && !(provider_info->caps & ~(provider->caps)) && !(provider_info->mode & ~(provider->mode))
         && provider_info->addr_format == provider->addr_format) {
-        return 0;
+        return OPAL_SUCCESS;
     } else {
         return OPAL_ERROR;
     }
 }
 
+#if OPAL_OFI_PCI_DATA_AVAILABLE
+static int get_provider_nic_pci(struct fi_info *provider, struct fi_pci_attr *pci)
+{
+    if (NULL != provider->nic && NULL != provider->nic->bus_attr
+        && FI_BUS_PCI == provider->nic->bus_attr->bus_type) {
+        *pci = provider->nic->bus_attr->attr.pci;
+        return OPAL_SUCCESS;
+    }
+    return OPAL_ERR_NOT_AVAILABLE;
+}
+#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
+
 /**
  * Calculate device distances
  *
@@ -557,177 +569,252 @@ static int compute_dev_distances(pmix_device_distance_t **distances,
 }
 
 /**
- * Find the nearest devices to the current thread
+ * @brief Get the provider distance from the provided distance metrics
  *
- * Use the PMIx server or calculate the device distances, then out of the set of
- * returned distances find the subset of the nearest devices. This can be
- * 1 or more.
+ * @param[in]   topology        hwloc topology
+ * @param[in]   provider        Provider object
+ * @param[in]   distances       List of known device distances
+ * @param[in]   num_distances   Length of distances
+ * @param[out]  distance        Pointer to store the provider distance
+ * @return      OPAL_SUCCESS if and only if the distance is found in the provided list
+ */
+#if OPAL_OFI_PCI_DATA_AVAILABLE
+static int get_provider_distance(hwloc_topology_t topology, struct fi_info *provider,
+                                 pmix_device_distance_t *distances, int num_distances,
+                                 uint16_t *distance)
+{
+    hwloc_obj_t pcidev, osdev;
+    struct fi_pci_attr pci = {0};
+
+    if (OPAL_SUCCESS != get_provider_nic_pci(provider, &pci)) {
+        opal_output_verbose(1, opal_common_ofi.output, "Cannot determine PCI attributes of provider %s",
+                            provider->domain_attr->name);
+        return OPAL_ERROR;
+    }
+
+    pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
+                                       pci.function_id);
+    if (!pcidev) {
+        opal_output_verbose(1, opal_common_ofi.output, "Cannot locate PCI device of provider %s",
+                            provider->domain_attr->name);
+        return OPAL_ERROR;
+    }
+
+#if HWLOC_API_VERSION < 0x00020000
+    osdev = pcidev->first_child;
+#else
+    osdev = pcidev->io_first_child;
+#endif /* HWLOC_API_VERSION */
+    for (; osdev != NULL; osdev = osdev->next_sibling) {
+        int i;
+
+        if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
+            const char *nguid = hwloc_obj_get_info_by_name(osdev, "NodeGUID");
+            const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
+
+            if (!nguid && !sguid)
+                continue;
+
+            for (i = 0; i < num_distances; i++) {
+                char lsguid[20], lnguid[20];
+                int ret;
+
+                if (PMIX_DEVTYPE_OPENFABRICS != distances[i].type) {
+                    continue;
+                }
+
+                if (!distances[i].osname || !osdev->name
+                    || strcmp(distances[i].osname, osdev->name))
+                    continue;
+
+                ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid);
+                if (ret != 2)
+                    continue;
+
+                if ((nguid && (0 == strcasecmp(lnguid, nguid)))
+                    || (sguid && (0 == strcasecmp(lsguid, sguid)))) {
+                    *distance = distances[i].mindist;
+                    return OPAL_SUCCESS;
+                }
+            }
+        } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
+            const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
+            if (!address)
+                continue;
+            for (i = 0; i < num_distances; i++) {
+                if (PMIX_DEVTYPE_NETWORK != distances[i].type) {
+                    continue;
+                }
+                char *addr = strstr(distances[i].uuid, "://");
+                if (!addr || addr + 3 > distances[i].uuid + strlen(distances[i].uuid))
+                    continue;
+                if (!strcmp(addr + 3, address)) {
+                    *distance = distances[i].mindist;
+                    return OPAL_SUCCESS;
+                }
+            }
+        }
+    }
+
+    return OPAL_ERROR;
+}
+#else
+static int get_provider_distance(hwloc_topology_t topology, struct fi_info *provider,
+                                 pmix_device_distance_t *distances, int num_distances,
+                                 uint16_t *distance)
+{
+    return OPAL_ERROR;
+}
+#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
+
+/**
+ * @brief Get the nearest device to the current thread
  *
- * @param num_distances (OUT) number of entries in the returned array
+ * Compute the distances from the current thread to each NIC in provider_list,
+ * and select the NIC with the shortest distance.
+ * If there are multiple equidistant devices, break the tie using local rank
+ * to balance NIC utilization.
  *
- * @return An array of device distances which are nearest this thread
- *         or NULL if we fail to get the distances. In this case we will just
- *         revert to round robin.
+ * @param[in]   topology        hwloc topology
+ * @param[in]   provider_list   List of providers to select from
+ * @param[in]   num_providers   Number of providers in provider_list
+ * @param[in]   rank            local rank of the process
+ * @param[out]  provider        pointer to the selected provider
  *
+ * @return      OPAL_SUCCESS if and only if a nearest provider is found.
  */
-static pmix_device_distance_t *
-get_nearest_nics(int *num_distances, pmix_value_t **valin)
+static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_list,
+                           size_t num_providers, uint32_t rank, struct fi_info **provider)
 {
-    size_t ndist, i;
-    int ret, idx = 0;
+    int ret;
     pmix_data_array_t *dptr;
-    uint16_t near = USHRT_MAX;
+    pmix_device_distance_t *distances;
     pmix_info_t directive;
     pmix_value_t *val = NULL;
-    pmix_device_distance_t *distances, *nearest = NULL;
+    size_t ndist, num_nearest = 0;
+    struct fi_info *current_provider = NULL;
+    uint16_t dists[num_providers], *dist = NULL, min_dist = USHRT_MAX;
+    uint32_t provider_rank = 0;
 
     PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
-    ret = PMIx_Get(&opal_process_info.myprocid,
-                   PMIX_DEVICE_DISTANCES, &directive, 1, &val);
+    ret = PMIx_Get(&opal_process_info.myprocid, PMIX_DEVICE_DISTANCES, &directive, 1, &val);
     PMIx_Info_destruct(&directive);
     if (ret != PMIX_SUCCESS || !val) {
         ret = compute_dev_distances(&distances, &ndist);
         if (ret) {
+            ret = OPAL_ERROR;
             goto out;
         }
         goto find_nearest;
     }
 
     if (PMIX_DATA_ARRAY != val->type) {
+        ret = OPAL_ERROR;
         goto out;
     }
     dptr = val->data.darray;
     if (NULL == dptr) {
+        ret = OPAL_ERROR;
         goto out;
     }
     if (PMIX_DEVICE_DIST != dptr->type) {
+        ret = OPAL_ERROR;
         goto out;
     }
 
-    distances = (pmix_device_distance_t*)dptr->array;
+    distances = (pmix_device_distance_t *) dptr->array;
     ndist = dptr->size;
 
 find_nearest:
-    nearest = calloc(sizeof(*distances), ndist);
-    if (!nearest) {
-        goto out;
-    }
-
-    for (i = 0; i < ndist; i++) {
-        if (distances[i].type != PMIX_DEVTYPE_NETWORK &&
-            distances[i].type != PMIX_DEVTYPE_OPENFABRICS)
+    for (current_provider = provider_list, dist = dists; NULL != current_provider;
+         current_provider = current_provider->next, ++dist) {
+        if (OPAL_SUCCESS != check_provider_attr(provider_list, current_provider)) {
             continue;
-        if (distances[i].mindist < near) {
-            idx = 0;
-            near = distances[i].mindist;
-            nearest[idx] = distances[i];
-            idx++;
-        } else if (distances[i].mindist == near) {
-            nearest[idx] = distances[i];
-            idx++;
+        }
+        if (OPAL_SUCCESS != get_provider_distance(topology, current_provider, distances, ndist, dist)) {
+            *dist = USHRT_MAX;
+        }
+
+        if (*dist < min_dist) {
+            min_dist = *dist;
+            num_nearest = 1;
+        } else if (*dist == min_dist) {
+            ++num_nearest;
+        }
+
+        if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) {
+            opal_output_verbose(1, opal_common_ofi.output, "provider: %s dist: %d",
+                                current_provider->domain_attr->name, *dist);
         }
     }
 
-    *num_distances = idx;
+    ret = OPAL_ERROR;
+    if (0 >= num_nearest) {
+        /* leave through the common exit so that val is released */
+        goto out;
+    }
 
+    provider_rank = rank % num_nearest;
+    num_nearest = 0;
+    /* Walk dists in lockstep with the provider list, exactly as the loop above filled it */
+    for (current_provider = provider_list, dist = dists; NULL != current_provider;
+         current_provider = current_provider->next, ++dist) {
+        if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)
+            && min_dist == *dist && provider_rank == num_nearest++) {
+            *provider = current_provider;
+            ret = OPAL_SUCCESS;
+            goto out;
+        }
+    }
 out:
-    *valin = val;
-    return nearest;
+    if (val)
+        PMIx_Value_free(val, 1);
+
+    return ret;
 }
 
-#if OPAL_OFI_PCI_DATA_AVAILABLE
 /**
- * Determine if a device is nearest
+ * @brief Selects a provider from the list in a round-robin fashion
  *
- * Given a device distances array of the nearest pci devices,
- * determine if one of these device distances refers to the pci
- * device passed in
- *
- * @param distances (IN) distances array
- * @param num_distances (IN) number of entries in the distances array
- * @param topology (IN) topology of the node
- * @param pci (IN) PCI device being examined
- *
- * @return true if the PCI device is in the distances array or if the
- *         distances array is not provided. False otherwise.
+ * This function implements a round-robin algorithm to select a provider from
+ * the provided list based on a rank. Only providers of the same type as the
+ * first provider are eligible for selection.
  *
+ * @param[in]   provider_list   A list of providers to select from.
+ * @param[in]   rank            A rank metric for the current process, such as
+ *                              the rank on the same node or CPU package.
+ * @return      Pointer to the selected provider
  */
-#if HWLOC_API_VERSION < 0x00020000
-static bool is_near(pmix_device_distance_t *distances,
-                    int num_distances,
-                    hwloc_topology_t topology,
-                    struct fi_pci_attr pci)
+static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank)
 {
-    return true;
-}
-#else
-static bool is_near(pmix_device_distance_t *distances,
-                    int num_distances,
-                    hwloc_topology_t topology,
-                    struct fi_pci_attr pci)
-{
-    hwloc_obj_t pcidev, osdev;
+    uint32_t provider_rank = 0, current_rank = 0;
+    size_t num_providers = 0;
+    struct fi_info *current_provider = NULL;
 
-    /* if we failed to find any distances, then we consider all interfaces
-     * to be of equal distances and let the caller decide how to handle
-     * them
-     */
-    if (!distances)
-        return true;
-
-    pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
-                                       pci.bus_id, pci.device_id,
-                                       pci.function_id);
-    if (!pcidev)
-        return false;
-
-    for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
-        int i;
-
-        if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
-            const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
-            const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
-
-            if (!nguid && !sguid)
-                continue;
+    for (current_provider = provider_list; NULL != current_provider;) {
+        if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) {
+            ++num_providers;
+        }
+        current_provider = current_provider->next;
+    }
 
-            for (i = 0; i < num_distances; i++) {
-                char lsguid[20], lnguid[20];
-                int ret;
+    current_provider = provider_list;
+    if (2 > num_providers) {
+        goto out;
+    }
 
-                if (!distances[i].osname || !osdev->name
-                    || strcmp(distances[i].osname, osdev->name))
-                    continue;
+    provider_rank = rank % num_providers;
 
-                ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid);
-                if (ret != 2)
-                    continue;
-                if (nguid && (0 == strcasecmp(lnguid, nguid))) {
-                    return true;
-                } else if (sguid && (0 == strcasecmp(lsguid, sguid))) {
-                    return true;
-                }
-            }
-        } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
-            const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
-            if (!address)
-                continue;
-            for (i = 0; i < num_distances; i++) {
-                char *addr = strstr(distances[i].uuid, "://");
-                if (!addr || addr + 3 > distances[i].uuid
-                    + strlen(distances[i].uuid))
-                    continue;
-                if (!strcmp(addr+3, address)) {
-                    return true;
-                }
-            }
+    while (NULL != current_provider) {
+        if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)
+            && provider_rank == current_rank++) {
+            break;
         }
+        current_provider = current_provider->next;
     }
-
-    return false;
+out:
+    return current_provider;
 }
-#endif
-#endif // OPAL_OFI_PCI_DATA_AVAILABLE
 
 static int count_providers(struct fi_info *provider_list)
 {
@@ -829,108 +916,50 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
 }
 
 struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
-                                                 opal_process_info_t *process_info)
+                                                opal_process_info_t *process_info)
 {
-    struct fi_info *provider = provider_list, *current_provider = provider_list;
-    struct fi_info **provider_table;
-#if OPAL_OFI_PCI_DATA_AVAILABLE
-    pmix_device_distance_t *distances = NULL;
-    pmix_value_t *pmix_val;
-    struct fi_pci_attr pci;
-    int num_distances = 0;
-#endif
-    bool near = false;
-    int ret;
-    unsigned int num_provider = 0, provider_limit = 0;
-    bool provider_found = false;
+    int ret, num_providers = 0;
+    struct fi_info *provider = NULL;
+    uint32_t package_rank = process_info->my_local_rank;
+
+    num_providers = count_providers(provider_list);
+    if (!process_info->proc_is_bound || 2 > num_providers) {
+        goto round_robin;
+    }
 
     /* Initialize opal_hwloc_topology if it is not already */
     ret = opal_hwloc_base_get_topology();
     if (0 > ret) {
         /* Provider selection can continue but there is no guarantee of locality */
-        opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology\n",
+        opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology",
                             __FILE__, __LINE__);
     }
 
-    provider_limit = count_providers(provider_list);
-
-    /* Allocate memory for provider table */
-    provider_table = calloc(provider_limit, sizeof(struct fi_info *));
-    if (NULL == provider_table) {
-        opal_output_verbose(1, opal_common_ofi.output,
-                            "%s:%d:Failed to allocate memory for provider table\n", __FILE__,
-                            __LINE__);
-        return provider_list;
-    }
+    package_rank = get_package_rank(process_info);
 
 #if OPAL_OFI_PCI_DATA_AVAILABLE
-    /* find all the nearest devices to this thread, then out of these
-     * determine which device we should bind to.
+    /**
+     * If provider PCI BDF information is available, we calculate its physical distance
+     * to the current process, and select the provider with the shortest distance.
      */
-    distances = get_nearest_nics(&num_distances, &pmix_val);
-#endif
-
-    current_provider = provider;
-
-    /* Cycle through remaining fi_info objects, looking for alike providers */
-    while (NULL != current_provider) {
-        if (!check_provider_attr(provider, current_provider)) {
-            near = false;
-#if OPAL_OFI_PCI_DATA_AVAILABLE
-            if (NULL != current_provider->nic
-                && NULL != current_provider->nic->bus_attr
-                && current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
-                pci = current_provider->nic->bus_attr->attr.pci;
-                near = is_near(distances, num_distances,
-                               opal_hwloc_topology, pci);
-            }
-#endif
-            /* We could have multiple near providers */
-            if (near && !provider_found) {
-                provider_found = true;
-                num_provider = 0;
-            }
-
-            /* Add the provider to the provider list if the cpusets match or if
-             * no other provider was found on the same cpuset as the process.
-             */
-            if (near || !provider_found) {
-                provider_table[num_provider] = current_provider;
-                num_provider++;
-            }
-        }
-        current_provider = current_provider->next;
-    }
-
-    /* Select provider from local rank % number of providers */
-    uint32_t package_rank = get_package_rank(process_info);
-    if (num_provider >= 2) {
-        // If there are multiple NICs "close" to the process, try to calculate package_rank
-        provider = provider_table[package_rank % num_provider];
-    } else if (num_provider == 1) {
-        provider = provider_table[num_provider - 1];
+    ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank,
+                          &provider);
+    if (OPAL_SUCCESS == ret) {
+        goto out;
     }
+#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
 
-#if OPAL_OFI_PCI_DATA_AVAILABLE
-    if (NULL != provider->nic
-        && NULL != provider->nic->bus_attr
-        && provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
-        pci = provider->nic->bus_attr->attr.pci;
-        near = is_near(distances, num_distances,
-                       opal_hwloc_topology, pci);
+round_robin:
+    if (!process_info->proc_is_bound && 1 < num_providers
+        && opal_output_get_verbosity(opal_common_ofi.output) >= 1) {
+        opal_show_help("help-common-ofi.txt", "unbound_process", true, 1);
     }
-#endif
 
+    provider = select_provider_round_robin(provider_list, package_rank);
+out:
 #if OPAL_ENABLE_DEBUG
-    opal_output_verbose(1, opal_common_ofi.output,
-                        "package rank: %d device: %s near: %s\n", package_rank,
-                        provider->domain_attr->name, near ? "true" : "false");
-#endif
-
-    free(provider_table);
-#if OPAL_OFI_PCI_DATA_AVAILABLE
-    if (pmix_val)
-        PMIx_Value_free(pmix_val, 1);
+    opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank,
+                        provider->domain_attr->name);
 #endif
     return provider;
 }
@@ -1000,5 +1029,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
     }
     return ret;
 }
-
-
diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h
index 0bf114f5907..7118f6a3a01 100644
--- a/opal/mca/common/ofi/common_ofi.h
+++ b/opal/mca/common/ofi/common_ofi.h
@@ -135,47 +135,47 @@ OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provi
 /**
  * Selects NIC (provider) based on hardware locality
  *
- * In multi-nic situations, use hardware topology to pick the "best"
- * of the selected NICs.
- * There are 3 main cases that this covers:
- *
- *      1. If the first provider passed into this function is the only valid
- *      provider, this provider is returned.
- *
- *      2. If there is more than 1 provider that matches the type of the first
- *      provider in the list, and the BDF data
- *      is available then a provider is selected based on locality of device
- *      cpuset and process cpuset and tries to ensure that processes
- *      are distributed evenly across NICs. This has two separate
- *      cases:
- *
- *          i. There is one or more provider local to the process:
- *
- *              (local rank % number of providers of the same type
- *              that share the process cpuset) is used to select one
- *              of these providers.
- *
- *          ii. There is no provider that is local to the process:
- *
- *              (local rank % number of providers of the same type)
- *              is used to select one of these providers
- *
- *      3. If there is more than 1 providers of the same type in the
- *      list, and the BDF data is not available (the ofi version does
- *      not support fi_info.nic or the provider does not support BDF)
- *      then (local rank % number of providers of the same type) is
- *      used to select one of these providers
- *
- *      @param provider_list (IN)   struct fi_info* An initially selected
- *                                  provider NIC. The provider name and
- *                                  attributes are used to restrict NIC
- *                                  selection. This provider is returned if the
- *                                  NIC selection fails.
- *
- *      @param provider (OUT)       struct fi_info* object with the selected
- *                                  provider if the selection succeeds
- *                                  if the selection fails, returns the fi_info
- *                                  object that was initially provided.
+ * The selection is based on the following priority:
+ *
+ * Single-NIC:
+ *
+ *      If only 1 provider is available, always return that provider.
+ *
+ * Multi-NIC:
+ *
+ *      1. If the process is NOT bound, pick a NIC using (local rank % number
+ *      of providers of the same type). This gives a fair chance to each
+ *      qualified NIC and balances overall utilization.
+ *
+ *      2. If the process is bound, we compare providers in the list that have
+ *      the same type as the first provider, and find the provider with the
+ *      shortest distance to the current process.
+ *
+ *          i. If the provider has PCI BDF data, we attempt to compute the
+ *          distance between the NIC and the current process cpuset. The NIC
+ *          with the shortest distance is returned.
+ *
+ *              * For equidistant NICs, we select a NIC in round-robin fashion
+ *              using the package rank of the current process, i.e. (package
+ *              rank % number of providers with the same distance).
+ *
+ *          ii. If we cannot compute the distance between the NIC and the
+ *          current process, e.g. PCI BDF data is not available, a NIC will be
+ *          selected in a round-robin fashion using package rank, i.e. (package
+ *          rank % number of providers of the same type).
+ *
+ * @param[in]   provider_list   struct fi_info* An initially selected
+ *                              provider NIC. The provider name and
+ *                              attributes are used to restrict NIC
+ *                              selection. This provider is returned if the
+ *                              NIC selection fails.
+ *
+ * @param[in]   process_info    opal_process_info_t* The current process info
+ *
+ * @return                      struct fi_info* object with the selected
+ *                              provider if the selection succeeds
+ *                              if the selection fails, returns the fi_info
+ *                              object that was initially provided.
  *
  * All errors should be recoverable and will return the initially provided
  * provider. However, if an error occurs we can no longer guarantee
@@ -184,7 +184,7 @@ OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provi
  *
  */
 OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
-                                                               opal_process_info_t *process_info);
+                                                              opal_process_info_t *process_info);
 
 /**
  * Obtain EP endpoint name
diff --git a/opal/mca/common/ofi/help-common-ofi.txt b/opal/mca/common/ofi/help-common-ofi.txt
index 44366a64c5f..de3630f7e7a 100644
--- a/opal/mca/common/ofi/help-common-ofi.txt
+++ b/opal/mca/common/ofi/help-common-ofi.txt
@@ -7,6 +7,12 @@
 #
 # $HEADER$
 #
+[unbound_process]
+Open MPI's OFI driver detected multiple NICs on the system but cannot select an
+optimal device because the current process is not bound. This may negatively
+impact performance. This can be resolved by specifying "--bind-to ..." on
+command line.
+
 [package_rank failed]
 Open MPI's OFI driver detected multiple equidistant NICs from the current process,
 but had insufficient information to ensure MPI processes fairly pick a NIC for use.