30
30
31
31
#include "common_ofi.h"
32
32
#include "opal/constants.h"
33
+ #include "opal/mca/accelerator/accelerator.h"
33
34
#include "opal/mca/base/mca_base_framework.h"
34
35
#include "opal/mca/base/mca_base_var.h"
35
36
#include "opal/mca/hwloc/base/base.h"
38
39
#include "opal/util/argv.h"
39
40
#include "opal/util/show_help.h"
40
41
42
+ extern opal_accelerator_base_module_t opal_accelerator ;
41
43
/*
 * Module-wide state shared by the common OFI code. The provider include and
 * exclude lists start out as NULL and the output stream id starts at -1.
 */
opal_common_ofi_module_t opal_common_ofi = {
    .prov_include = NULL,
    .prov_exclude = NULL,
    .output = -1,
};
@@ -778,6 +780,168 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
778
780
return (uint32_t ) process_info -> myprocid .rank ;
779
781
}
780
782
783
+ static int get_obj_depth (hwloc_obj_t obj , int * depth )
784
+ {
785
+ hwloc_obj_t parent = NULL ;
786
+ int depth_from_obj = 0 ;
787
+
788
+ /* For hwloc < 2.0, depth is unsigned type, but it could store a negative value */
789
+ if (0 <= (int ) obj -> depth ) {
790
+ * depth = obj -> depth ;
791
+ return OPAL_SUCCESS ;
792
+ }
793
+
794
+ parent = obj -> parent ;
795
+ while (parent ) {
796
+ ++ depth_from_obj ;
797
+ if (0 <= (int ) parent -> depth ) {
798
+ * depth = parent -> depth + depth_from_obj ;
799
+ return OPAL_SUCCESS ;
800
+ }
801
+ parent = obj -> parent ;
802
+ }
803
+
804
+ return OPAL_ERROR ;
805
+ }
806
+
807
#if OPAL_OFI_PCI_DATA_AVAILABLE
/**
 * @brief Attempt to find the provider nearest to the accelerator.
 * Check if opal_accelerator is initialized with a valid PCI device, and pick the provider whose
 * PCI device shares the deepest common ancestor with the accelerator in the hwloc topology.
 * Special cases:
 * 1. If no accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
 * 2. If no matching provider has PCI attributes, we do not attempt to make a selection, and
 *    return OPAL_ERR_NOT_AVAILABLE.
 * 3. If there is more than 1 provider at the same nearest distance, break the tie using a modulo
 *    i.e. (local rank on the same accelerator) % (number of nearest providers)
 * 4. If the provider list is empty (NULL list or num_providers == 0), returns
 *    OPAL_ERR_NOT_AVAILABLE.
 * @param[in]   provider_list   linked list of providers
 * @param[in]   num_providers   number of providers; at most this many list entries are examined
 * @param[in]   device_rank     local rank on the accelerator
 * @param[out]  provider        pointer to the selected provider
 * @return      OPAL_SUCCESS if a provider is successfully selected
 *              OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
 *              OPAL_ERROR if the accelerator PCI attributes cannot be read, a PCI device or
 *              common ancestor cannot be located in the hwloc topology, or a depth cannot be
 *              computed
 */
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
                                                  size_t num_providers, uint32_t device_rank,
                                                  struct fi_info **provider)
{
    /* Marks list entries that are not candidates. Real depths are never negative, so this
     * cannot collide with a genuine common-ancestor depth of 0. */
    const int not_a_candidate = -1;
    hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
    int ret = -1, accl_id = -1, depth = -1, max_common_ancestor_depth = -1;
    opal_accelerator_pci_attr_t accl_pci_attr = {0};
    struct fi_info *current_provider = NULL;
    struct fi_pci_attr pci = {0};
    uint32_t near_provider_count = 0, provider_rank = 0;
    size_t idx = 0;

    /* Nothing to choose from; also avoids declaring a zero-length VLA below, which is
     * undefined behavior. */
    if (NULL == provider_list || 0 == num_providers) {
        return OPAL_ERR_NOT_AVAILABLE;
    }

    int distances[num_providers];
    for (idx = 0; idx < num_providers; ++idx) {
        distances[idx] = not_a_candidate;
    }

    ret = opal_accelerator.get_device(&accl_id);
    if (OPAL_SUCCESS != ret) {
        opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
                            __FILE__, __LINE__);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
    if (OPAL_SUCCESS != ret) {
        opal_output_verbose(1, opal_common_ofi.output,
                            "%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
                                         accl_pci_attr.bus_id, accl_pci_attr.device_id,
                                         accl_pci_attr.function_id);
    if (NULL == accl_dev) {
        opal_output_verbose(1, opal_common_ofi.output,
                            "%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    opal_output_verbose(1, opal_common_ofi.output,
                        "%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
                        __FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
                        accl_pci_attr.device_id, accl_pci_attr.function_id,
                        accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);

    /* Pass 1: record, per list position, the depth of the common ancestor shared with the
     * accelerator, and track the deepest one seen and how many providers reach it. */
    for (current_provider = provider_list, idx = 0;
         NULL != current_provider && idx < num_providers;
         current_provider = current_provider->next, ++idx) {

        /* Only providers matching the head of the list that expose PCI bus attributes are
         * candidates; everything else keeps the not_a_candidate marker. */
        if (0 != check_provider_attr(provider_list, current_provider)) {
            continue;
        }
        if (NULL == current_provider->nic || NULL == current_provider->nic->bus_attr
            || current_provider->nic->bus_attr->bus_type != FI_BUS_PCI) {
            continue;
        }

        pci = current_provider->nic->bus_attr->attr.pci;

        prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
                                             pci.device_id, pci.function_id);
        if (NULL == prov_dev) {
            opal_output_verbose(1, opal_common_ofi.output,
                                "%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
            return OPAL_ERROR;
        }

        common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev, prov_dev);
        if (!common_ancestor) {
            opal_output_verbose(
                1, opal_common_ofi.output,
                "%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
                __FILE__, __LINE__);
            /**
             * Return error because any 2 PCI devices should share at least one common ancestor,
             * i.e. root
             */
            return OPAL_ERROR;
        }

        ret = get_obj_depth(common_ancestor, &depth);
        if (OPAL_SUCCESS != ret) {
            opal_output_verbose(1, opal_common_ofi.output,
                                "%s:%d:Failed to get common ancestor depth", __FILE__,
                                __LINE__);
            return OPAL_ERROR;
        }

        distances[idx] = depth;

        if (max_common_ancestor_depth < depth) {
            max_common_ancestor_depth = depth;
            near_provider_count = 1;
        } else if (max_common_ancestor_depth == depth) {
            ++near_provider_count;
        }
    }

    if (0 == near_provider_count || 0 > max_common_ancestor_depth) {
        opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
                            __FILE__, __LINE__);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    provider_rank = device_rank % near_provider_count;

    /* Pass 2: count the nearest providers down from near_provider_count and stop at the one
     * whose countdown value equals provider_rank. */
    for (current_provider = provider_list, idx = 0;
         NULL != current_provider && idx < num_providers;
         current_provider = current_provider->next, ++idx) {
        if (max_common_ancestor_depth != distances[idx]) {
            continue;
        }

        --near_provider_count;
        if (provider_rank == near_provider_count) {
            *provider = current_provider;
            return OPAL_SUCCESS;
        }
    }

    /* Not expected to be reached: provider_rank < near_provider_count guarantees a match above */
    assert(0 == near_provider_count);

    return OPAL_ERROR;
}
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
943
+
944
+
781
945
struct fi_info * opal_common_ofi_select_provider (struct fi_info * provider_list ,
782
946
opal_process_info_t * process_info )
783
947
{
@@ -793,6 +957,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793
957
int ret ;
794
958
unsigned int num_provider = 0 , provider_limit = 0 ;
795
959
bool provider_found = false;
960
+ uint32_t package_rank ;
796
961
797
962
/* Initialize opal_hwloc_topology if it is not already */
798
963
ret = opal_hwloc_base_get_topology ();
@@ -802,8 +967,29 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
802
967
__FILE__ , __LINE__ );
803
968
}
804
969
970
+ /* Current process' local rank on the same package(socket) */
971
+ package_rank = get_package_rank (process_info );
805
972
provider_limit = count_providers (provider_list );
806
973
974
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
975
+ /**
976
+ * If accelerator is enabled, select the closest provider to the accelerator.
977
+ * Note: the function expects a local rank on the accelerator to break ties if there are
978
+ * multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
979
+ */
980
+ ret = find_nearest_provider_from_accelerator (provider_list , provider_limit , package_rank ,
981
+ & provider );
982
+ if (!ret )
983
+ return provider ;
984
+
985
+ if (OPAL_ERR_NOT_AVAILABLE != ret ) {
986
+ opal_output_verbose (1 , opal_common_ofi .output ,
987
+ "%s:%d:Failed to find a provider close to the accelerator. Error: %d" ,
988
+ __FILE__ , __LINE__ , ret );
989
+ return provider_list ;
990
+ }
991
+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
992
+
807
993
/* Allocate memory for provider table */
808
994
provider_table = calloc (provider_limit , sizeof (struct fi_info * ));
809
995
if (NULL == provider_table ) {
@@ -820,11 +1006,11 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
820
1006
distances = get_nearest_nics (& num_distances , & pmix_val );
821
1007
#endif
822
1008
823
- current_provider = provider ;
1009
+ current_provider = provider_list ;
824
1010
825
1011
/* Cycle through remaining fi_info objects, looking for alike providers */
826
1012
while (NULL != current_provider ) {
827
- if (!check_provider_attr (provider , current_provider )) {
1013
+ if (!check_provider_attr (provider_list , current_provider )) {
828
1014
near = false;
829
1015
#if OPAL_OFI_PCI_DATA_AVAILABLE
830
1016
if (NULL != current_provider -> nic
@@ -853,7 +1039,6 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
853
1039
}
854
1040
855
1041
/* Select provider from local rank % number of providers */
856
- uint32_t package_rank = get_package_rank (process_info );
857
1042
if (num_provider >= 2 ) {
858
1043
// If there are multiple NICs "close" to the process, try to calculate package_rank
859
1044
provider = provider_table [package_rank % num_provider ];
0 commit comments