
Add NVIDIA Makefile support for multiple architectures ('fat binaries')
jfurtek committed Jul 20, 2017
1 parent 59d90a5 commit 8e1656c
Showing 2 changed files with 16 additions and 6 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -599,6 +599,14 @@ You need to build the code for the appropriate architecture. By default, the arc
ARCH=sm_61 ## Just an example for Pascal architecture
```

In some cases, it may be useful to generate benchmarking executables for multiple architectures. For example, some systems may have multiple graphics processors with different architectures installed. The NVIDIA compiler (nvcc) supports the generation of "fat binaries" that contain intermediate and compiled code for multiple target architectures. To compile for multiple architectures, add a comma-separated list of architectures to the `make` command line.

```
ARCH=sm_30,sm_32,sm_35,sm_50,sm_52,sm_60,sm_61 # Everything since Kepler!
```
Note that compilation for multiple architectures takes longer than compilation for a single architecture. Also, not all CUDA versions support all architectures; for example, support for sm_60 (and later) requires CUDA 8.
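
The Makefile converts the comma-separated list into one `--generate-code` option per architecture (see the `NVCC_ARCH_ARGS` variable in the Makefile change below). As a rough illustration, `ARCH=sm_60,sm_61` expands to approximately the following nvcc flags:

```
--generate-code arch=compute_60,code=sm_60 --generate-code arch=compute_61,code=sm_61
```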


For inference problems with `int8` precision, the convolution and gemm kernels need to be padded to multiples of 4. By default, the kernels are padded and results are reported with padding. To disable padding, use the following build option. When padding is disabled, benchmark numbers aren't reported for the kernels that aren't supported.
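
The option itself is not shown in the visible portion of this diff. Judging from the `PAD_KERNELS?=1` default in the Makefile change below, disabling padding presumably amounts to overriding that variable on the `make` command line, for example:

```
PAD_KERNELS=0 ## Presumed override of the Makefile default PAD_KERNELS?=1
```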

```
14 changes: 8 additions & 6 deletions code/nvidia/Makefile
@@ -17,36 +17,38 @@ CONV_LIBRARY?=cudnn
CONV_PATH?=$
KERNELS_DIR=../kernels/
PAD_KERNELS?=1
COMMA=,
NVCC_ARCH_ARGS=$(foreach a,$(subst $(COMMA), ,$(ARCH)),--generate-code arch=$(patsubst sm_%,compute_%,$(a)),code=$(a))

.PHONY=all gemm conv rnn all_reduce nccl_single nccl_mpi clean

all: gemm conv rnn all_reduce

gemm:
$(MKDIR) $(BIN_DIR)
$(CUDA_PATH)/bin/$(NVCC) gemm_bench.cu -DPAD_KERNELS=$(PAD_KERNELS) -o $(BIN_DIR)/gemm_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -L $(BLAS_PATH) -l$(BLAS_LIBRARY) -L $(CUDA_LIB64) -lcurand -arch=$(ARCH) -std=c++11
$(CUDA_PATH)/bin/$(NVCC) gemm_bench.cu -DPAD_KERNELS=$(PAD_KERNELS) -o $(BIN_DIR)/gemm_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -L $(BLAS_PATH) -l$(BLAS_LIBRARY) -L $(CUDA_LIB64) -lcurand $(NVCC_ARCH_ARGS) -std=c++11

conv:
$(MKDIR) $(BIN_DIR)
$(CUDA_PATH)/bin/$(NVCC) conv_bench.cu -DPAD_KERNELS=$(PAD_KERNELS) -o $(BIN_DIR)/conv_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -I $(CUDNN_PATH)/include/ -L $(CUDNN_PATH)/lib64/ -L $(CUDA_LIB64) -lcurand -lcudnn -arch=$(ARCH) -std=c++11
$(CUDA_PATH)/bin/$(NVCC) conv_bench.cu -DPAD_KERNELS=$(PAD_KERNELS) -o $(BIN_DIR)/conv_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -I $(CUDNN_PATH)/include/ -L $(CUDNN_PATH)/lib64/ -L $(CUDA_LIB64) -lcurand -lcudnn $(NVCC_ARCH_ARGS) -std=c++11

rnn:
$(MKDIR) $(BIN_DIR)
$(CUDA_PATH)/bin/$(NVCC) rnn_bench.cu -o $(BIN_DIR)/rnn_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -I $(CUDNN_PATH)/include/ -L $(CUDNN_PATH)/lib64/ -L $(CUDA_LIB64) -lcurand -lcudnn -arch=$(ARCH) -std=c++11
$(CUDA_PATH)/bin/$(NVCC) rnn_bench.cu -o $(BIN_DIR)/rnn_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -I $(CUDNN_PATH)/include/ -L $(CUDNN_PATH)/lib64/ -L $(CUDA_LIB64) -lcurand -lcudnn $(NVCC_ARCH_ARGS) -std=c++11

all_reduce: nccl_single nccl_mpi

nccl_single:
$(MKDIR) $(BIN_DIR)
$(CUDA_PATH)/bin/$(NVCC) nccl_single_all_reduce.cu -o $(BIN_DIR)/nccl_single_all_reduce -I $(NCCL_PATH)/build/include/ -I $(CUDNN_PATH)/include/ -L $(NCCL_PATH)/build/lib/ -L $(CUDNN_PATH)/lib64 -lnccl -lcudart -lcurand -arch=$(ARCH) -std=c++11
$(CUDA_PATH)/bin/$(NVCC) nccl_single_all_reduce.cu -o $(BIN_DIR)/nccl_single_all_reduce -I $(NCCL_PATH)/build/include/ -I $(CUDNN_PATH)/include/ -L $(NCCL_PATH)/build/lib/ -L $(CUDNN_PATH)/lib64 -lnccl -lcudart -lcurand $(NVCC_ARCH_ARGS) -std=c++11

nccl_mpi:
$(MKDIR) $(BIN_DIR)
$(CUDA_PATH)/bin/$(NVCC) nccl_mpi_all_reduce.cu -o $(BIN_DIR)/nccl_mpi_all_reduce -I $(NCCL_PATH)/build/include/ -I $(CUDNN_PATH)/include/ -I $(MPI_PATH)/include -L $(NCCL_PATH)/build/lib/ -L $(CUDNN_PATH)/lib64 -L $(MPI_PATH)/lib -lnccl -lcurand -lcudart -lmpi -arch=$(ARCH) -std=c++11
$(CUDA_PATH)/bin/$(NVCC) nccl_mpi_all_reduce.cu -o $(BIN_DIR)/nccl_mpi_all_reduce -I $(NCCL_PATH)/build/include/ -I $(CUDNN_PATH)/include/ -I $(MPI_PATH)/include -L $(NCCL_PATH)/build/lib/ -L $(CUDNN_PATH)/lib64 -L $(MPI_PATH)/lib -lnccl -lcurand -lcudart -lmpi $(NVCC_ARCH_ARGS) -std=c++11

sparse:
$(MKDIR) $(BIN_DIR)
$(CUDA_PATH)/bin/$(NVCC) sparse_bench.cu -o $(BIN_DIR)/sparse_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -L $(BLAS_PATH) -l$(BLAS_LIBRARY) -L $(CUDA_LIB64) -lcurand -lcusparse -arch=$(ARCH) -std=c++11
$(CUDA_PATH)/bin/$(NVCC) sparse_bench.cu -o $(BIN_DIR)/sparse_bench -I $(KERNELS_DIR) -I $(CUDA_PATH)/include -L $(BLAS_PATH) -l$(BLAS_LIBRARY) -L $(CUDA_LIB64) -lcurand -lcusparse $(NVCC_ARCH_ARGS) -std=c++11


clean:
