Skip to content

Commit

Permalink
Merge pull request NVIDIA#283 from lukeyeager/topo-trim-net-links
Browse files Browse the repository at this point in the history
Topo trim net links
  • Loading branch information
sjeaugey authored Jan 16, 2020
2 parents 3899f6e + 7a18fe0 commit 44c34e5
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
7 changes: 4 additions & 3 deletions makefiles/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,10 @@ else
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})

CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
CXXFLAGS += -I $(CUDA_INC)
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
-Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \
-I $(CUDA_INC) \
$(CXXFLAGS)
# Maxrregcount needs to be set according to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
Expand Down
23 changes: 23 additions & 0 deletions src/graph/paths.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,18 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int
// Free all path arrays that lead to, or originate from, nodes of `nodeType`.
//
// For every node type in the system this releases:
//   - the paths[nodeType] entry of each node of that type (paths _to_ nodeType)
//   - the paths[type] entry of each nodeType node         (paths _from_ nodeType)
// Every released entry is reset to NULL, so an entry visited twice (which
// happens when the current type equals nodeType) is only freed once:
// the second free() receives NULL and does nothing.
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
  for (int type = 0; type < NCCL_TOPO_NODE_TYPES; type++) {
    // Paths whose destination is the given type
    for (int idx = 0; idx < system->nodes[type].count; idx++) {
      struct ncclTopoNode* src = &system->nodes[type].nodes[idx];
      free(src->paths[nodeType]);
      src->paths[nodeType] = NULL;
    }

    // Paths whose origin is the given type
    for (int idx = 0; idx < system->nodes[nodeType].count; idx++) {
      struct ncclTopoNode* origin = &system->nodes[nodeType].nodes[idx];
      free(origin->paths[type]);
      origin->paths[type] = NULL;
    }
  }
}

Expand Down Expand Up @@ -309,6 +316,22 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
// Trim network
ncclTopoRemovePathType(system, NET);
system->nodes[NET].count = 0;
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = &(node->links[l]);
if (link->remNode->type == NET) {
// Remove the link
for (int i=l; i<(node->nlinks-1); i++) {
memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
}
node->nlinks--;
l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
}
}
}
}
}
free(domains);
free(ids);
Expand Down

0 comments on commit 44c34e5

Please sign in to comment.