Merge pull request conda-forge#9694 from jakirkham/add_nccl_final

Add NCCL
phadjido · Oct 22, 2019 · aecde18 · aecde18
2 parents 59309a0 + 38626dd
commit aecde18
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 0 deletions.
diff --git a/recipes/nccl/PR_220.patch b/recipes/nccl/PR_220.patch
@@ -0,0 +1,32 @@
+From e3c8188d461ebb45312ccbf23a81613192e00189 Mon Sep 17 00:00:00 2001
+From: John Kirkham <[email protected]>
+Date: Tue, 7 May 2019 17:29:39 -0400
+Subject: [PATCH] Allow CUDA runtime library selection
+
+Makes a change to allow the user to select between the static CUDA
+runtime library (default) and the dynamic CUDA runtime library. Does
+this by allowing `CUDARTLIB` to be overridden.
+---
+ src/Makefile | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/src/Makefile b/src/Makefile
+index 2d32dca78..bf5429cad 100644
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -23,13 +23,14 @@ INCDIR := $(BUILDDIR)/include
+ LIBDIR := $(BUILDDIR)/lib
+ OBJDIR := $(BUILDDIR)/obj
+ ##### target files
++CUDARTLIB  ?= cudart_static
+ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
+ LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
+ LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
+ STATICLIBTARGET := $(STATICLIBNAME)
+ LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
+ DEPFILES   := $(LIBOBJ:%.o=%.d)
+-LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
++LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
+
+ DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
+
diff --git a/recipes/nccl/build.sh b/recipes/nccl/build.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+################################################
+# Use `asm-generic/socket.h` from the distro.  #
+# Needed to ensure `SO_REUSEPORT` is defined.  #
+# That option was added in Linux kernel 3.9,   #
+# but RHEL 6 and CentOS 6 backported it.       #
+# The sysroot compilers only see the sysroot's #
+# headers, so the build host's copy of this    #
+# header is copied over the sysroot's copy.    #
+#                                              #
+# ref: https://lwn.net/Articles/542629/        #
+################################################
+CONDA_BUILD_SYSROOT="$(${CC} --print-sysroot)"
+cp /usr/include/asm-generic/socket.h "${CONDA_BUILD_SYSROOT}/usr/include/asm-generic/socket.h"
+
+make -j${CPU_COUNT} CUDA_HOME="${CUDA_HOME}" CUDARTLIB="cudart"
+make install PREFIX="${PREFIX}"
diff --git a/recipes/nccl/meta.yaml b/recipes/nccl/meta.yaml
@@ -0,0 +1,59 @@
+{% set name = "nccl" %}
+{% set version = "2.4.6" %}
+{% set revision = "1" %}
+
+package:
+  name: {{ name|lower }}
+  version: {{ version }}.{{ revision }}  # upstream tags are v<version>-<revision>; joined with a dot here
+
+source:
+  url: https://github.com/NVIDIA/nccl/archive/v{{ version }}-{{ revision }}.tar.gz
+  sha256: ea4421061a7b9c454f2e088f68bfdbbcefab80ce81cafc70ee6c7742b1439591
+  patches:
+    ########################################################################
+    # Patch to allow selection of static or dynamic CUDA runtime library.  #
+    #                                                                      #
+    # xref: https://github.com/NVIDIA/nccl/pull/220                        #
+    ########################################################################
+    - PR_220.patch
+
+build:
+  number: 0
+  skip: true  # [(not linux64) or (cuda_compiler_version == "None")]
+  run_exports:
+    # Pin packages built against nccl to its major version; xref: https://github.com/NVIDIA/nccl/issues/218
+    - {{ pin_subpackage(name, max_pin="x") }}
+
+requirements:
+  build:
+    - {{ compiler("c") }}
+    - {{ compiler("cxx") }}
+    - {{ compiler("cuda") }}
+    - make  # build.sh drives the build and install through `make`
+
+test:
+  commands:
+    - test -f "${PREFIX}/include/nccl.h"
+    - test -f "${PREFIX}/lib/libnccl.so"
+    - test -f "${PREFIX}/lib/libnccl_static.a"
+
+about:
+  home: https://developer.nvidia.com/nccl
+  license: BSD-3-Clause
+  license_family: BSD
+  license_file: LICENSE.txt
+  summary: Optimized primitives for collective multi-GPU communication
+
+  description: |
+    The NVIDIA Collective Communications Library (NCCL) implements multi-GPU
+    and multi-node collective communication primitives that are performance
+    optimized for NVIDIA GPUs. NCCL provides routines such as all-gather,
+    all-reduce, broadcast, reduce, reduce-scatter, that are optimized to
+    achieve high bandwidth over PCIe and NVLink high-speed interconnect.
+
+  doc_url: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/index.html
+  dev_url: https://github.com/NVIDIA/nccl
+
+extra:
+  recipe-maintainers:
+    - jakirkham