Last time we analyzed how NCCL performs topology analysis of the machine’s PCI system, producing results in XML format. Next, NCCL will establish a graph based on this XML to facilitate subsequent path searching.
At the end of ncclTopoGetSystem, ncclTopoGetSystemFromXml is executed to convert the XML representation into a graph.
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
  NCCLCHECK(ncclCalloc(topoSystem, 1));
  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(xml, "system", &topNode));
  for (int s=0; s<topNode->nSubs; s++) {
    struct ncclXmlNode* node = topNode->subs[s];
    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
  }
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
  NCCLCHECK(ncclTopoSortSystem(*topoSystem));
  return ncclSuccess;
}
First, it gets the root node “system” from the XML, then traverses the “cpu” child nodes. For each CPU, graph construction is performed through ncclTopoAddCpu. Here, each CPU actually represents a NUMA node.
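For reference, the XML produced by the detection step has roughly the following shape (a hand-written, abridged example: the attribute values are purely illustrative and many attributes are omitted):

<system version="1">
  <cpu numaid="0" affinity="0000ffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
    <pci busid="0000:05:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
      <pci busid="0000:06:00.0" class="0x030200" link_speed="8 GT/s" link_width="16">
        <gpu dev="0" sm="70" rank="0" gdr="1">
          <nvlink target="0000:07:00.0" count="2" tclass="0x030200"/>
        </gpu>
      </pci>
      <pci busid="0000:08:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
        <nic>
          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x1234" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
</system>

With this structure in mind, each “cpu” element is handed to ncclTopoAddCpu: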
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
  int numaId;
  NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
  // The NUMA node id becomes the CPU node's id.
  struct ncclTopoNode* cpu;
  NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
  if (str != NULL) {
    // Cores belonging to this NUMA node.
    NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
  }
  NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
  NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
    NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
    NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
    }
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
    struct ncclXmlNode* node = xmlCpu->subs[s];
    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
    if (strcmp(node->name, "nic") == 0) {
      // NIC attached directly to the CPU root complex, with no PCI switch in between.
      struct ncclTopoNode* nic = NULL;
      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
      if (nic == NULL) {
        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
      }
      NCCLCHECK(ncclTopoAddNic(node, system, nic));
    }
  }
  return ncclSuccess;
}
ncclTopoAddCpu creates a CPU node keyed by numaid, sets the CPU's affinity (the set of cores belonging to that NUMA node), and records the architecture, vendor, and, for Intel x86 CPUs, the family/model (used to distinguish Skylake-and-later from Broadwell-era parts).
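The affinity string is the usual sysfs cpumap format (comma-separated groups of hex digits, with the least-significant CPUs on the right). As a rough, simplified sketch of what ncclStrToCpuset has to do (this is not NCCL's actual implementation), parsing such a string into a cpu_set_t might look like:

// Simplified sketch, not NCCL's ncclStrToCpuset: parse a sysfs-style
// cpumap string such as "0000ffff" into a cpu_set_t.
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void strToCpuset(const char* str, cpu_set_t* mask) {
  CPU_ZERO(mask);
  int cpu = 0;
  // Walk the string from the end: the last hex digit covers CPUs 0-3,
  // the one before it CPUs 4-7, and so on; commas are only separators.
  for (int i = (int)strlen(str) - 1; i >= 0; i--) {
    char c = str[i];
    if (c == ',') continue;
    int v = (c >= 'a') ? c - 'a' + 10 : (c >= 'A') ? c - 'A' + 10 : c - '0';
    for (int b = 0; b < 4; b++) {
      if (v & (1 << b)) CPU_SET(cpu + b, mask);
    }
    cpu += 4;
  }
}

int main() {
  cpu_set_t mask;
  strToCpuset("0000ffff", &mask);  // CPUs 0-15 belong to this NUMA node
  printf("cpu 0 in set: %d\n", CPU_ISSET(0, &mask) ? 1 : 0);    // 1
  printf("cpu 16 in set: %d\n", CPU_ISSET(16, &mask) ? 1 : 0);  // 0
  return 0;
}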
Next, it traverses the CPU node's children and dispatches on their type: a “nic” child (a NIC attached directly to the CPU, with no PCI switch in between) gets a NIC node connected straight to the CPU, while a “pci” child is handled by ncclTopoAddPci.
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
  const char* str;
  int type;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
  NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

  int64_t busId;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
  NCCLCHECK(busIdToInt64(str, &busId));

  struct ncclTopoNode* node = NULL;
  if (type == GPU) {
    struct ncclXmlNode* xmlGpu;
    NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
    if (xmlGpu == NULL) return ncclSuccess;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
    // GPUs without a rank attribute are not used by this communicator.
    if (index == -1) return ncclSuccess;
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
  }
  if (type == NIC) {
    struct ncclXmlNode* xmlNic;
    NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
    if (xmlNic == NULL) return ncclSuccess;

    // Ignore sub device ID and merge multi-port NICs into one PCI device.
    busId &= 0xfffffffffffffff0;
    struct ncclTopoNode* nicNode = NULL;
    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
    if (nicNode == NULL) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
      node = nicNode; // Connect it to parent later on
    }
    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
  } else if (type == PCI) {
    // A PCI switch/bridge: create a node for it and recurse into its children.
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
    }
  }

  if (node) {
    int width, speed;
    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

    // Manage cases where speed was not indicated in /sys
    if (width == 0) width = 16;
    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

    // width*speed is in units of 100 Mb/s; dividing by 80 converts to GB/s.
    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
  }
  return ncclSuccess;
}
ncclTopoAddPci first reads the device class and busid of the current PCI XML node. For a plain PCI bridge it creates a PCI node and recurses into the children, repeating until it reaches GPU or NIC XML nodes. Whatever node was created is then connected to its parent with a pair of LINK_PCI edges whose width is derived from the link_width and link_speed attributes.
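As a quick sanity check of the width*speed/80.0 formula: the value from kvDictPciGen is in units of 100 Mb/s per lane, and dividing by 80 converts 100 Mb/s into GB/s. Assuming the Gen3 “8 GT/s” entry maps to 60 (an assumption for illustration; the real values live in kvDictPciGen), a x16 Gen3 link comes out at 12 GB/s:

#include <stdio.h>

int main() {
  int width = 16;   // link_width attribute
  int speed = 60;   // assumed kvDictPciGen value for "8 GT/s", in 100 Mb/s per lane
  printf("PCI link width: %.1f GB/s\n", width * speed / 80.0);  // prints 12.0 GB/s
  return 0;
}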
If it encounters a NIC, it creates a NIC node (after masking off the PCI function number, so the ports of a multi-port NIC are merged into one node) and calls ncclTopoAddNic. That function walks the “net” children of the XML NIC and, for each one, creates a NET node with dev as its id and fills in the speed, port, GDR support, and other properties (a small numeric illustration follows the two functions below).
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  int dev;
  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));

  struct ncclTopoNode* net;
  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
  if (str) sscanf(str, "0x%lx", &net->net.asic);
  else net->net.asic = dev;

  ncclDebugNoWarn = NCCL_GRAPH;
  int mbps;
  if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
  if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
  net->net.width = mbps / 8000.0;
  if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
  if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
  ncclDebugNoWarn = 0;

  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
  NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
  return ncclSuccess;
}

ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
    if (strcmp(xmlNet->name, "net") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
    if (index == -1) continue;
    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
  }
  return ncclSuccess;
}
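Two unit details in the code above are easy to miss: in ncclTopoAddPci the low nibble of the NIC's integer bus ID (the PCI function number) is masked off so that the ports of a dual-port NIC share a single NIC node, and in ncclTopoAddNet the speed attribute is in Mb/s while the graph stores GB/s. A small illustration (the integer bus-ID encodings here are hypothetical):

#include <stdio.h>

int main() {
  // Two functions of a dual-port NIC differ only in the lowest nibble of the bus ID.
  long long port0 = 0x60000, port1 = 0x60001;  // hypothetical busIdToInt64 results
  printf("same NIC node: %d\n", (port0 & ~0xfLL) == (port1 & ~0xfLL));  // prints 1
  // A 100 Gb/s port reports speed="100000" (Mb/s); the NET node stores GB/s.
  int mbps = 100000;
  printf("net width: %.1f GB/s\n", mbps / 8000.0);  // prints 12.5 GB/s
  return 0;
}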
ncclTopoConnectNodes then creates the edges, here bidirectionally between the NET and NIC nodes: it sets the edge type, accumulates the bandwidth if an edge of the same type to the same remote node already exists, and keeps the node's edge list sorted by bandwidth in descending order (a standalone sketch of this behavior follows the function).
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
  // Aggregate links into higher width for NVLink
  struct ncclTopoLink* link;
  for (link = node->links; link->remNode; link++) {
    if (link->remNode == remNode && link->type == type) break;
  }
  if (link->remNode == NULL) node->nlinks++;
  link->type = type;
  link->remNode = remNode;
  link->width += width;

  // Sort links in BW descending order
  struct ncclTopoLink linkSave;
  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
  while (link != node->links) {
    if ((link-1)->width >= linkSave.width) break;
    memcpy(link, link-1, sizeof(struct ncclTopoLink));
    link--;
  }
  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
  return ncclSuccess;
}
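This aggregation is what later turns, for example, two physical NVLinks between the same pair of GPUs into one edge of double width. Here is a stripped-down, self-contained model of the same logic (my own simplified structs, not NCCL's) that can be compiled on its own to see the behavior:

#include <stdio.h>

#define MAX_LINKS 8

struct Node;
struct Link { struct Node* rem; int type; float width; };
struct Node { const char* name; int nlinks; struct Link links[MAX_LINKS+1]; };

// Same idea as ncclTopoConnectNodes: reuse an existing edge of the same type
// to the same remote node, accumulate its width, and keep the list sorted.
static void connectNodes(struct Node* node, struct Node* rem, int type, float width) {
  struct Link* link;
  for (link = node->links; link->rem; link++) {
    if (link->rem == rem && link->type == type) break;
  }
  if (link->rem == NULL) node->nlinks++;
  link->type = type;
  link->rem = rem;
  link->width += width;
  // Bubble the updated link forward so widths stay in descending order.
  while (link != node->links && (link-1)->width < link->width) {
    struct Link tmp = *(link-1); *(link-1) = *link; *link = tmp;
    link--;
  }
}

int main() {
  struct Node gpu = { .name = "gpu0" }, peer = { .name = "gpu1" }, cpu = { .name = "cpu0" };
  connectNodes(&gpu, &cpu, 1 /* "PCI" */, 12.0f);   // one PCI edge
  connectNodes(&gpu, &peer, 0 /* "NVL" */, 21.0f);  // first NVLink to the peer
  connectNodes(&gpu, &peer, 0 /* "NVL" */, 21.0f);  // second NVLink: same edge, width adds up
  for (int i = 0; i < gpu.nlinks; i++)
    printf("%s -> %s  width %.1f\n", gpu.name, gpu.links[i].rem->name, gpu.links[i].width);
  // Prints the NVLink edge (width 42.0) before the PCI edge (width 12.0).
  return 0;
}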
At this point, NIC addition is complete. Back in ncclTopoAddPci, if the device is a GPU, it creates a GPU node and fills in its rank, dev, GDR support, and other properties. Finally, the newly created node and its parent are linked with a pair of edges via ncclTopoConnectNodes.
This completes the graph under each NUMA node; next come the NVLink and inter-CPU (QPI) connections. Let's look at NVLink first.
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
  if (strcmp(node->name, "nvlink") == 0) {
    struct ncclTopoNode* gpu = NULL;
    int64_t pBusId;
    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
    if (gpu == NULL) {
      WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
      return ncclInternalError;
    }
    int count;
    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
    const char* targetClass;
    NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
    int targetType;
    NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
    struct ncclTopoNode* remote = NULL;
    if (targetType == GPU) {
      // NVL P2P connection to another GPU
      const char* target;
      NCCLCHECK(xmlGetAttrStr(node, "target", &target));
      int64_t busId;
      NCCLCHECK(busIdToInt64(target, &busId));
      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
    } else if (targetType == CPU) {
      // NVL connection to the local CPU
      NCCLCHECK(findLocalCpu(gpu, &remote));
    } else {
      if (system->nodes[NVS].count == 0) {
        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
      } else {
        remote = system->nodes[NVS].nodes;
      }
    }
    if (remote) {
      int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
      NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
      if (remote->type != GPU) {
        NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
      }
    }
  } else {
    const char* busId;
    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
    }
  }
  return ncclSuccess;
}
ncclTopoAddNvLinks recursively walks down from the root until it reaches an “nvlink” XML node; the bus id inherited from the parent identifies the GPU that owns the link. The tclass attribute gives the PCI class of the peer: for a GPU it looks up the peer GPU node by its target bus id, for a CPU it uses findLocalCpu, and for anything else (an NVSwitch) it creates a single NVS node (id 0) on first use and reuses it afterwards. It then connects the GPU to the peer with width count*nvlSpeed; for non-GPU peers the reverse edge is added here too, while a GPU-to-GPU link gets its reverse edge when the other GPU's own nvlink records are processed. Because ncclTopoConnectNodes aggregates, a GPU whose nvlink records add up to six links towards NVSwitches ends up with a single GPU-to-NVS edge of width 6*nvlSpeed. After NVLinks, ncclTopoConnectCpus connects every pair of CPU nodes.
Finally, to make the later channel search easier, ncclTopoSortSystem calls ncclTopoSort on each CPU node to recursively reorder every node's edges into the order NVLink, downward PCI, upward PCI, QPI. Since the edges were already kept sorted by bandwidth as they were created, NVLink edges are already first and QPI last; the only remaining work is to move each node's upward link (the one pointing back toward its parent) to the end of its list, which is exactly what ncclTopoSort does.
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    int l=0;
    while (node->links[l].remNode != upNode) l++;
    struct ncclTopoLink upLink;
    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
    while (node->links[l+1].remNode) {
      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
      l++;
    }
    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
  }

  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}
This completes the graph construction. To summarize: the XML produced by topology detection is inconvenient to search directly, so this step built a graph of the PCI system from the XML, which the subsequent path search will operate on.