Merge pull request #249259 from ConnorBaker/feat/torch-use-cuda-redist

python3Packages.torch: migrate to CUDA redist from CUDA Toolkit
Commit 5c516a45c2
Author: Connor Baker (committed by GitHub)
Date:   2023-09-18 08:27:38 -04:00

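This change drops the monolithic `cudatoolkit` dependency in favor of the granular CUDA redistributable packages exposed through `cudaPackages` (`cuda_nvcc`, `cuda_cudart`, `libcublas`, and so on), so the build pulls in only the pieces torch actually uses, and the old top-level `assert`s become `meta.broken` conditions. As a rough illustration of the general pattern only (a minimal sketch, not code from this commit; `myPackage` and its inputs are placeholders):

```nix
# Hypothetical sketch of the cudatoolkit -> cudaPackages redist migration pattern.
# "myPackage" is a placeholder; only the shape of the change matters here.
{ lib, stdenv, config, cudaSupport ? config.cudaSupport, cudaPackages }:

stdenv.mkDerivation {
  pname = "myPackage";
  version = "0.1.0";
  src = ./.;

  # Before: one large input that carries the entire toolkit.
  #   buildInputs = lib.optionals cudaSupport [ cudaPackages.cudatoolkit ];

  # After: only the split redist outputs that are actually used,
  # keeping headers (.dev) and libraries (.lib) separate.
  buildInputs = lib.optionals cudaSupport (with cudaPackages; [
    cuda_cudart      # cuda_runtime.h and the runtime libraries
    libcublas.dev
    libcublas.lib
  ]);

  nativeBuildInputs = lib.optionals cudaSupport (with cudaPackages; [
    cuda_nvcc                # the compiler belongs in nativeBuildInputs
    autoAddOpenGLRunpathHook # adds the driver runpath so libcuda.so resolves at run time
  ]);
}
```

The full diff follows.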

@@ -1,4 +1,4 @@
-{ stdenv, lib, fetchFromGitHub, buildPythonPackage, python,
+{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
   config, cudaSupport ? config.cudaSupport, cudaPackages, magma,
   useSystemNccl ? true,
   MPISupport ? false, mpi,
@@ -52,17 +52,8 @@
 let
   inherit (lib) lists strings trivial;
-  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
+  inherit (cudaPackages) cudaFlags cudnn nccl;
-in
-assert cudaSupport -> stdenv.isLinux;
-assert cudaSupport -> (cudaPackages.cudaMajorVersion == "11");
-# confirm that cudatoolkits are sync'd across dependencies
-assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit;
-assert !cudaSupport || magma.cudaPackages.cudatoolkit == cudatoolkit;
-let
   setBool = v: if v then "1" else "0";
   # https://github.com/pytorch/pytorch/blob/v2.0.1/torch/utils/cpp_extension.py#L1744
@@ -103,23 +94,6 @@ let
       throw "No GPU targets specified"
   );
-  cudatoolkit_joined = symlinkJoin {
-    name = "${cudatoolkit.name}-unsplit";
-    # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs
-    paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
-  };
-  # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
-  # LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub
-  # libcuda.so from cudatoolkit for running tests, so that we dont have
-  # to recompile pytorch on every update to nvidia-x11 or the kernel.
-  cudaStub = linkFarm "cuda-stub" [{
-    name = "libcuda.so.1";
-    path = "${cudatoolkit}/lib/stubs/libcuda.so";
-  }];
-  cudaStubEnv = lib.optionalString cudaSupport
-    "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
   rocmtoolkit_joined = symlinkJoin {
     name = "rocm-merged";
@@ -160,6 +134,12 @@
     # base is 10.12. Until we upgrade, we can fall back on the older
     # pthread support.
     ./pthreadpool-disable-gcd.diff
+  ] ++ lib.optionals stdenv.isLinux [
+    # Propagate CUPTI to Kineto by overriding the search path with environment variables.
+    (fetchpatch {
+      url = "https://github.com/pytorch/pytorch/pull/108847/commits/7ae4d7c0e2dec358b4fe81538efe9da5eb580ec9.patch";
+      hash = "sha256-skFaDg98xcJqJfzxWk+qhUxPLHDStqvd0mec3PgksIg=";
+    })
   ];
   postPatch = lib.optionalString rocmSupport ''
@@ -184,6 +164,13 @@
       --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
         "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." hip.version))})"
   ''
+  # Detection of NCCL version doesn't work particularly well when using the static binary.
+  + lib.optionalString cudaSupport ''
+    substituteInPlace cmake/Modules/FindNCCL.cmake \
+      --replace \
+        'message(FATAL_ERROR "Found NCCL header version and library version' \
+        'message(WARNING "Found NCCL header version and library version'
+  ''
   # error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc'
   # This lib overrided aligned_alloc hence the error message. Tltr: his function is linkable but not in header.
   + lib.optionalString (stdenv.isDarwin && lib.versionOlder stdenv.targetPlatform.darwinSdkVersion "11.0") ''
@@ -192,12 +179,16 @@
       inline void *aligned_alloc(size_t align, size_t size)'
   '';
+  # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken
+  # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time
+  # without extreme care to ensure they don't lock each other out of shared resources.
+  # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195.
   preConfigure = lib.optionalString cudaSupport ''
     export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
-    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
+  '' + lib.optionalString (cudaSupport && cudnn != null) ''
     export CUDNN_INCLUDE_DIR=${cudnn.dev}/include
     export CUDNN_LIB_DIR=${cudnn.lib}/lib
+    export CUPTI_INCLUDE_DIR=${cudaPackages.cuda_cupti.dev}/include
+    export CUPTI_LIBRARY_DIR=${cudaPackages.cuda_cupti.lib}/lib
   '' + lib.optionalString rocmSupport ''
     export ROCM_PATH=${rocmtoolkit_joined}
     export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
@@ -256,6 +247,7 @@
   PYTORCH_BUILD_NUMBER = 0;
   USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL
+  USE_STATIC_NCCL = setBool useSystemNccl;
   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
   # (upstream seems to have fixed this in the wrong place?)
@@ -286,12 +278,43 @@
     pybind11
     pythonRelaxDepsHook
     removeReferencesTo
-  ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ]
-    ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
+  ] ++ lib.optionals cudaSupport (with cudaPackages; [
+    autoAddOpenGLRunpathHook
+    cuda_nvcc
+  ])
+  ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
   buildInputs = [ blas blas.provider pybind11 ]
     ++ lib.optionals stdenv.isLinux [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now
-    ++ lib.optionals cudaSupport [ cudnn.dev cudnn.lib nccl ]
+    ++ lib.optionals cudaSupport (with cudaPackages; [
+      cuda_cccl.dev # <thrust/*>
+      cuda_cudart # cuda_runtime.h and libraries
+      cuda_cupti.dev # For kineto
+      cuda_cupti.lib # For kineto
+      cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildinputs, it's needed here too
+      cuda_nvml_dev.dev # <nvml.h>
+      cuda_nvrtc.dev
+      cuda_nvrtc.lib
+      cuda_nvtx.dev
+      cuda_nvtx.lib # -llibNVToolsExt
+      cudnn.dev
+      cudnn.lib
+      libcublas.dev
+      libcublas.lib
+      libcufft.dev
+      libcufft.lib
+      libcurand.dev
+      libcurand.lib
+      libcusolver.dev
+      libcusolver.lib
+      libcusparse.dev
+      libcusparse.lib
+      nccl.dev # Provides nccl.h AND a static copy of NCCL!
+    ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
+      cuda_nvprof.dev # <cuda_profiler_api.h>
+    ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
+      cuda_profiler_api.dev # <cuda_profiler_api.h>
+    ])
     ++ lib.optionals rocmSupport [ openmp ]
     ++ lib.optionals (cudaSupport || rocmSupport) [ magma ]
     ++ lib.optionals stdenv.isLinux [ numactl ]
@@ -335,7 +358,6 @@
   checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [
     "runHook preCheck"
-    cudaStubEnv
     "${python.interpreter} test/run_test.py"
     "--exclude"
     (concatStringsSep " " [
@@ -419,6 +441,17 @@
     license = licenses.bsd3;
     maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
     platforms = with platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin;
-    broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive
+    broken = builtins.any trivial.id [
+      # CUDA and ROCm are mutually exclusive
+      (cudaSupport && rocmSupport)
+      # CUDA is only supported on Linux
+      (cudaSupport && !stdenv.isLinux)
+      # Only CUDA 11 is currently supported
+      (cudaSupport && (cudaPackages.cudaMajorVersion != "11"))
+      # MPI cudatoolkit does not match cudaPackages.cudatoolkit
+      (MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit))
+      # Magma cudaPackages does not match cudaPackages
+      (cudaSupport && (magma.cudaPackages != cudaPackages))
+    ];
   };
 }
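For reference, the migrated package is still driven by the standard nixpkgs configuration switches; a minimal sketch of evaluating it with CUDA enabled (assuming a local nixpkgs checkout, using only the stock `config` options):

```nix
# Minimal sketch: evaluating the migrated torch with CUDA enabled.
# Only standard nixpkgs config options are used; nothing here is added by this commit.
let
  pkgs = import ./. {                # a local nixpkgs checkout
    system = "x86_64-linux";
    config = {
      allowUnfree = true;            # the CUDA redist packages are unfree
      cudaSupport = true;            # picked up by `cudaSupport ? config.cudaSupport`
    };
  };
in
pkgs.python3Packages.torch
```

Built this way, the CUDA pieces come from the split redist outputs listed in `buildInputs` above rather than from a single `cudatoolkit` store path.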