From b0bd1943b329a4fd0106fe5d305d296964c84fa6 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Tue, 15 Aug 2023 04:05:22 +0000 Subject: [PATCH] python3Packages.torch: migrate to CUDA redist from CUDA Toolkit --- .../python-modules/torch/default.nix | 103 ++++++++++++------ 1 file changed, 68 insertions(+), 35 deletions(-) diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix index 0dcc2fdba2d9..1fa790686cac 100644 --- a/pkgs/development/python-modules/torch/default.nix +++ b/pkgs/development/python-modules/torch/default.nix @@ -1,4 +1,4 @@ -{ stdenv, lib, fetchFromGitHub, buildPythonPackage, python, +{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python, config, cudaSupport ? config.cudaSupport, cudaPackages, magma, useSystemNccl ? true, MPISupport ? false, mpi, @@ -52,17 +52,8 @@ let inherit (lib) lists strings trivial; - inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl; -in + inherit (cudaPackages) cudaFlags cudnn nccl; -assert cudaSupport -> stdenv.isLinux; -assert cudaSupport -> (cudaPackages.cudaMajorVersion == "11"); - -# confirm that cudatoolkits are sync'd across dependencies -assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit; -assert !cudaSupport || magma.cudaPackages.cudatoolkit == cudatoolkit; - -let setBool = v: if v then "1" else "0"; # https://github.com/pytorch/pytorch/blob/v2.0.1/torch/utils/cpp_extension.py#L1744 @@ -103,23 +94,6 @@ let throw "No GPU targets specified" ); - cudatoolkit_joined = symlinkJoin { - name = "${cudatoolkit.name}-unsplit"; - # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs - paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ]; - }; - - # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via - # LD_LIBRARY_PATH=/run/opengl-driver/lib. 
We only use the stub - # libcuda.so from cudatoolkit for running tests, so that we don’t have - # to recompile pytorch on every update to nvidia-x11 or the kernel. - cudaStub = linkFarm "cuda-stub" [{ - name = "libcuda.so.1"; - path = "${cudatoolkit}/lib/stubs/libcuda.so"; - }]; - cudaStubEnv = lib.optionalString cudaSupport - "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH "; - rocmtoolkit_joined = symlinkJoin { name = "rocm-merged"; @@ -160,6 +134,12 @@ in buildPythonPackage rec { # base is 10.12. Until we upgrade, we can fall back on the older # pthread support. ./pthreadpool-disable-gcd.diff + ] ++ lib.optionals stdenv.isLinux [ + # Propagate CUPTI to Kineto by overriding the search path with environment variables. + (fetchpatch { + url = "https://github.com/pytorch/pytorch/pull/108847/commits/7ae4d7c0e2dec358b4fe81538efe9da5eb580ec9.patch"; + hash = "sha256-skFaDg98xcJqJfzxWk+qhUxPLHDStqvd0mec3PgksIg="; + }) ]; postPatch = lib.optionalString rocmSupport '' @@ -184,6 +164,13 @@ in buildPythonPackage rec { --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \ "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." hip.version))})" '' + # Detection of NCCL version doesn't work particularly well when using the static binary. + + lib.optionalString cudaSupport '' + substituteInPlace cmake/Modules/FindNCCL.cmake \ + --replace \ + 'message(FATAL_ERROR "Found NCCL header version and library version' \ + 'message(WARNING "Found NCCL header version and library version' + '' # error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc' # This lib overrided aligned_alloc hence the error message. Tltr: his function is linkable but not in header. 
+ lib.optionalString (stdenv.isDarwin && lib.versionOlder stdenv.targetPlatform.darwinSdkVersion "11.0") '' @@ -192,12 +179,16 @@ in buildPythonPackage rec { inline void *aligned_alloc(size_t align, size_t size)' ''; + # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken + # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time + # without extreme care to ensure they don't lock each other out of shared resources. + # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195. preConfigure = lib.optionalString cudaSupport '' export TORCH_CUDA_ARCH_LIST="${gpuTargetString}" - export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++ - '' + lib.optionalString (cudaSupport && cudnn != null) '' export CUDNN_INCLUDE_DIR=${cudnn.dev}/include export CUDNN_LIB_DIR=${cudnn.lib}/lib + export CUPTI_INCLUDE_DIR=${cudaPackages.cuda_cupti.dev}/include + export CUPTI_LIBRARY_DIR=${cudaPackages.cuda_cupti.lib}/lib '' + lib.optionalString rocmSupport '' export ROCM_PATH=${rocmtoolkit_joined} export ROCM_SOURCE_DIR=${rocmtoolkit_joined} @@ -256,6 +247,7 @@ in buildPythonPackage rec { PYTORCH_BUILD_NUMBER = 0; USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL + USE_STATIC_NCCL = setBool useSystemNccl; # Suppress a weird warning in mkl-dnn, part of ideep in pytorch # (upstream seems to have fixed this in the wrong place?) 
@@ -286,12 +278,43 @@ in buildPythonPackage rec { pybind11 pythonRelaxDepsHook removeReferencesTo - ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ] - ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; + ] ++ lib.optionals cudaSupport (with cudaPackages; [ + autoAddOpenGLRunpathHook + cuda_nvcc + ]) + ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; buildInputs = [ blas blas.provider pybind11 ] ++ lib.optionals stdenv.isLinux [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now - ++ lib.optionals cudaSupport [ cudnn.dev cudnn.lib nccl ] + ++ lib.optionals cudaSupport (with cudaPackages; [ + cuda_cccl.dev # + cuda_cudart # cuda_runtime.h and libraries + cuda_cupti.dev # For kineto + cuda_cupti.lib # For kineto + cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildInputs, it's needed here too + cuda_nvml_dev.dev # + cuda_nvrtc.dev + cuda_nvrtc.lib + cuda_nvtx.dev + cuda_nvtx.lib # -lnvToolsExt + cudnn.dev + cudnn.lib + libcublas.dev + libcublas.lib + libcufft.dev + libcufft.lib + libcurand.dev + libcurand.lib + libcusolver.dev + libcusolver.lib + libcusparse.dev + libcusparse.lib + nccl.dev # Provides nccl.h AND a static copy of NCCL! + ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [ + cuda_nvprof.dev # + ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [ + cuda_profiler_api.dev # + ]) ++ lib.optionals rocmSupport [ openmp ] ++ lib.optionals (cudaSupport || rocmSupport) [ magma ] ++ lib.optionals stdenv.isLinux [ numactl ] @@ -335,7 +358,6 @@ in buildPythonPackage rec { checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [ "runHook preCheck" - cudaStubEnv "${python.interpreter} test/run_test.py" "--exclude" (concatStringsSep " " [ @@ -419,6 +441,17 @@ in buildPythonPackage rec { license = licenses.bsd3; maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. 
for darwin-related builds platforms = with platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin; - broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive + broken = builtins.any trivial.id [ + # CUDA and ROCm are mutually exclusive + (cudaSupport && rocmSupport) + # CUDA is only supported on Linux + (cudaSupport && !stdenv.isLinux) + # Only CUDA 11 is currently supported + (cudaSupport && (cudaPackages.cudaMajorVersion != "11")) + # MPI cudatoolkit does not match cudaPackages.cudatoolkit + (MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit)) + # Magma cudaPackages does not match cudaPackages + (cudaSupport && (magma.cudaPackages != cudaPackages)) + ]; }; }