# NOTE: Though NCCL is called within the cudaPackages package set, we avoid passing in
# the names of dependencies from that package set directly, to prevent evaluation errors
# in case the redistributable packages are not available.
{
  lib,
  fetchFromGitHub,
  python3,
  which,
  autoAddDriverRunpath,
  cudaPackages,
  # passthru.updateScript
  gitUpdater,
}:
let
  inherit (cudaPackages)
    backendStdenv
    cuda_cccl
    cuda_cudart
    cuda_nvcc
    cudaAtLeast
    cudaFlags
    cudaOlder
    cudatoolkit
    ;
in
backendStdenv.mkDerivation (finalAttrs: {
  pname = "nccl";
  version = "2.21.5-1";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = finalAttrs.pname;
    rev = "v${finalAttrs.version}";
    hash = "sha256-IF2tILwW8XnzSmfn7N1CO7jXL95gUp02guIW5n1eaig=";
  };

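  # Structured attrs pass the derivation's attributes to the builder via a JSON file
  # instead of environment variables, so list-valued attributes such as makeFlagsArray
  # keep their element boundaries. strictDeps keeps build-time (nativeBuildInputs) and
  # host (buildInputs) dependencies strictly separated, which matters for cross-compilation.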
  __structuredAttrs = true;
  strictDeps = true;

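  # Split outputs: "out" keeps the shared libraries, while the headers (and, via
  # postFixup, the static archive) end up in "dev".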
  outputs = [
    "out"
    "dev"
  ];

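  # Build-time tools: which and python3 are used by NCCL's build scripts (the device
  # code generator is a Python script); autoAddDriverRunpath is a hook that adds the
  # CUDA driver's library path to the runpath of the produced binaries. The CUDA
  # compiler comes from the monolithic cudatoolkit on CUDA < 11.4 and from the split
  # cuda_nvcc package otherwise.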
  nativeBuildInputs =
    [
      which
      autoAddDriverRunpath
      python3
    ]
    ++ lib.optionals (cudaOlder "11.4") [ cudatoolkit ]
    ++ lib.optionals (cudaAtLeast "11.4") [ cuda_nvcc ];

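  # Host dependencies, i.e. the libraries NCCL itself is built against; kept separate
  # from nativeBuildInputs so that (with strictDeps) they are resolved for the host
  # platform rather than the build platform.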
  buildInputs =
    lib.optionals (cudaOlder "11.4") [ cudatoolkit ]
    ++ lib.optionals (cudaAtLeast "11.4") [
      cuda_nvcc.dev # crt/host_config.h
      cuda_cudart
    ]
    # NOTE: CUDA versions in Nixpkgs only use a major and minor version. When we do comparisons
    # against other versions, like the one below, it's important that we use the same format.
    # Otherwise, we'll get incorrect results.
    # For example, lib.versionAtLeast "12.0" "12.0.0" == false.
    ++ lib.optionals (cudaAtLeast "12.0") [ cuda_cccl ];

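  # Extra compiler flags injected through the cc wrapper; -Wno-unused-function silences
  # unused-function warnings emitted while compiling NCCL's sources.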
  env.NIX_CFLAGS_COMPILE = toString [ "-Wno-unused-function" ];

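  # Rewrite the device code generator's shebang to point at the python3 from the build
  # environment.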
  postPatch = ''
    patchShebangs ./src/device/generate.py
  '';

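  # NOTE: $(out) is expanded by make itself, from the `out` environment variable exported
  # by the builder, so the flag receives the output store path at build time.
  # NVCC_GENCODE selects the GPU architectures (gencode flags) nvcc compiles for.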
  makeFlagsArray =
    [
      "PREFIX=$(out)"
      "NVCC_GENCODE=${cudaFlags.gencodeString}"
    ]
    ++ lib.optionals (cudaOlder "11.4") [
      "CUDA_HOME=${cudatoolkit}"
      "CUDA_LIB=${lib.getLib cudatoolkit}/lib"
      "CUDA_INC=${lib.getDev cudatoolkit}/include"
    ]
    ++ lib.optionals (cudaAtLeast "11.4") [
      "CUDA_HOME=${cuda_nvcc}"
      "CUDA_LIB=${lib.getLib cuda_cudart}/lib"
      "CUDA_INC=${lib.getDev cuda_cudart}/include"
    ];

  enableParallelBuilding = true;

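  # The static archive is only needed for development, so move it into "dev" to keep the
  # runtime closure of "out" small.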
  postFixup = ''
    moveToOutput lib/libnccl_static.a $dev
  '';

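  # Let the nixpkgs update scripts discover new upstream releases from git tags of the
  # form v<version>.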
  passthru.updateScript = gitUpdater {
    inherit (finalAttrs) pname version;
    rev-prefix = "v";
  };

  meta = with lib; {
    description = "Multi-GPU and multi-node collective communication primitives for NVIDIA GPUs";
    homepage = "https://developer.nvidia.com/nccl";
    license = licenses.bsd3;
    platforms = platforms.linux;
    # NCCL is not supported on Jetson, because Jetson does not use NVLink or PCI-e for inter-GPU communication.
    # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
    badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ];
    maintainers =
      with maintainers;
      [
        mdaiter
        orivej
      ]
      ++ teams.cuda.members;
  };
})