python3Packages.torch: migrate to CUDA redist from CUDA Toolkit
parent 9a12fb6936
commit b0bd1943b3
1 changed file with 68 additions and 35 deletions
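In practice the migration means torch now consumes the individual CUDA redistributable packages exposed by cudaPackages (cuda_nvcc, cuda_cudart, libcublas, and so on, plus autoAddOpenGLRunpathHook) instead of the monolithic cudatoolkit. As a rough illustration of that packaging style — a minimal sketch around a hypothetical derivation, not code from this commit — compare how few components a consumer pulls in:

# example.nix -- hypothetical derivation, only to illustrate the redist style
# adopted in the diff below; the cudaPackages attribute names match the ones
# used in this commit, but the package itself is made up.
{ stdenv, cmake, cudaPackages }:

stdenv.mkDerivation {
  pname = "example-cuda-lib";
  version = "0.0.1";
  src = ./.;

  nativeBuildInputs = [
    cmake
    cudaPackages.cuda_nvcc                # just the compiler, not the whole toolkit
    cudaPackages.autoAddOpenGLRunpathHook # patch runpaths so libcuda.so.1 is found at runtime
  ];

  buildInputs = with cudaPackages; [
    cuda_cudart # cuda_runtime.h and the CUDA runtime libraries
    libcublas   # only the cuBLAS component, rather than cudatoolkit.out
  ];
}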
@@ -1,4 +1,4 @@
-{ stdenv, lib, fetchFromGitHub, buildPythonPackage, python,
+{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
   config, cudaSupport ? config.cudaSupport, cudaPackages, magma,
   useSystemNccl ? true,
   MPISupport ? false, mpi,
@@ -52,17 +52,8 @@
 let
   inherit (lib) lists strings trivial;
-  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
+  inherit (cudaPackages) cudaFlags cudnn nccl;
-in
-
-assert cudaSupport -> stdenv.isLinux;
-assert cudaSupport -> (cudaPackages.cudaMajorVersion == "11");
-
-# confirm that cudatoolkits are sync'd across dependencies
-assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit;
-assert !cudaSupport || magma.cudaPackages.cudatoolkit == cudatoolkit;
-
-let
   setBool = v: if v then "1" else "0";

   # https://github.com/pytorch/pytorch/blob/v2.0.1/torch/utils/cpp_extension.py#L1744
@@ -103,23 +94,6 @@ let
       throw "No GPU targets specified"
   );

-  cudatoolkit_joined = symlinkJoin {
-    name = "${cudatoolkit.name}-unsplit";
-    # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs
-    paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
-  };
-
-  # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
-  # LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub
-  # libcuda.so from cudatoolkit for running tests, so that we don’t have
-  # to recompile pytorch on every update to nvidia-x11 or the kernel.
-  cudaStub = linkFarm "cuda-stub" [{
-    name = "libcuda.so.1";
-    path = "${cudatoolkit}/lib/stubs/libcuda.so";
-  }];
-  cudaStubEnv = lib.optionalString cudaSupport
-    "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
-
   rocmtoolkit_joined = symlinkJoin {
     name = "rocm-merged";

@@ -160,6 +134,12 @@ in buildPythonPackage rec {
     # base is 10.12. Until we upgrade, we can fall back on the older
     # pthread support.
     ./pthreadpool-disable-gcd.diff
+  ] ++ lib.optionals stdenv.isLinux [
+    # Propagate CUPTI to Kineto by overriding the search path with environment variables.
+    (fetchpatch {
+      url = "https://github.com/pytorch/pytorch/pull/108847/commits/7ae4d7c0e2dec358b4fe81538efe9da5eb580ec9.patch";
+      hash = "sha256-skFaDg98xcJqJfzxWk+qhUxPLHDStqvd0mec3PgksIg=";
+    })
   ];

   postPatch = lib.optionalString rocmSupport ''
@@ -184,6 +164,13 @@ in buildPythonPackage rec {
       --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
         "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." hip.version))})"
   ''
+  # Detection of NCCL version doesn't work particularly well when using the static binary.
+  + lib.optionalString cudaSupport ''
+    substituteInPlace cmake/Modules/FindNCCL.cmake \
+      --replace \
+        'message(FATAL_ERROR "Found NCCL header version and library version' \
+        'message(WARNING "Found NCCL header version and library version'
+  ''
   # error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc'
   # This lib overrided aligned_alloc hence the error message. Tltr: his function is linkable but not in header.
   + lib.optionalString (stdenv.isDarwin && lib.versionOlder stdenv.targetPlatform.darwinSdkVersion "11.0") ''
@@ -192,12 +179,16 @@ in buildPythonPackage rec {
     inline void *aligned_alloc(size_t align, size_t size)'
   '';

+  # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken
+  # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time
+  # without extreme care to ensure they don't lock each other out of shared resources.
+  # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195.
   preConfigure = lib.optionalString cudaSupport ''
     export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
-    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
-  '' + lib.optionalString (cudaSupport && cudnn != null) ''
     export CUDNN_INCLUDE_DIR=${cudnn.dev}/include
     export CUDNN_LIB_DIR=${cudnn.lib}/lib
+    export CUPTI_INCLUDE_DIR=${cudaPackages.cuda_cupti.dev}/include
+    export CUPTI_LIBRARY_DIR=${cudaPackages.cuda_cupti.lib}/lib
   '' + lib.optionalString rocmSupport ''
     export ROCM_PATH=${rocmtoolkit_joined}
     export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
@@ -256,6 +247,7 @@ in buildPythonPackage rec {
   PYTORCH_BUILD_NUMBER = 0;

   USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL
+  USE_STATIC_NCCL = setBool useSystemNccl;

   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
   # (upstream seems to have fixed this in the wrong place?)
@@ -286,12 +278,43 @@ in buildPythonPackage rec {
     pybind11
     pythonRelaxDepsHook
     removeReferencesTo
-  ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ]
+  ] ++ lib.optionals cudaSupport (with cudaPackages; [
+    autoAddOpenGLRunpathHook
+    cuda_nvcc
+  ])
   ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];

   buildInputs = [ blas blas.provider pybind11 ]
     ++ lib.optionals stdenv.isLinux [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now
-    ++ lib.optionals cudaSupport [ cudnn.dev cudnn.lib nccl ]
+    ++ lib.optionals cudaSupport (with cudaPackages; [
+      cuda_cccl.dev # <thrust/*>
+      cuda_cudart # cuda_runtime.h and libraries
+      cuda_cupti.dev # For kineto
+      cuda_cupti.lib # For kineto
+      cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildinputs, it's needed here too
+      cuda_nvml_dev.dev # <nvml.h>
+      cuda_nvrtc.dev
+      cuda_nvrtc.lib
+      cuda_nvtx.dev
+      cuda_nvtx.lib # -llibNVToolsExt
+      cudnn.dev
+      cudnn.lib
+      libcublas.dev
+      libcublas.lib
+      libcufft.dev
+      libcufft.lib
+      libcurand.dev
+      libcurand.lib
+      libcusolver.dev
+      libcusolver.lib
+      libcusparse.dev
+      libcusparse.lib
+      nccl.dev # Provides nccl.h AND a static copy of NCCL!
+    ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
+      cuda_nvprof.dev # <cuda_profiler_api.h>
+    ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
+      cuda_profiler_api.dev # <cuda_profiler_api.h>
+    ])
     ++ lib.optionals rocmSupport [ openmp ]
     ++ lib.optionals (cudaSupport || rocmSupport) [ magma ]
     ++ lib.optionals stdenv.isLinux [ numactl ]
@@ -335,7 +358,6 @@ in buildPythonPackage rec {

   checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [
     "runHook preCheck"
-    cudaStubEnv
     "${python.interpreter} test/run_test.py"
     "--exclude"
     (concatStringsSep " " [
@@ -419,6 +441,17 @@ in buildPythonPackage rec {
     license = licenses.bsd3;
     maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
     platforms = with platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin;
-    broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive
+    broken = builtins.any trivial.id [
+      # CUDA and ROCm are mutually exclusive
+      (cudaSupport && rocmSupport)
+      # CUDA is only supported on Linux
+      (cudaSupport && !stdenv.isLinux)
+      # Only CUDA 11 is currently supported
+      (cudaSupport && (cudaPackages.cudaMajorVersion != "11"))
+      # MPI cudatoolkit does not match cudaPackages.cudatoolkit
+      (MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit))
+      # Magma cudaPackages does not match cudaPackages
+      (cudaSupport && (magma.cudaPackages != cudaPackages))
+    ];
   };
 }
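The meta.broken rewrite at the end of the diff trades a single boolean for builtins.any trivial.id applied to a list of conditions, which evaluates to true as soon as any entry is true (the same as chaining them with ||) while letting each failure mode carry its own comment. A minimal standalone sketch of the pattern, using made-up flags rather than the real torch arguments:

# broken-pattern.nix -- illustrative only; check with `nix-instantiate --eval broken-pattern.nix`
let
  id = x: x;          # plays the role of lib.trivial.id in the diff above
  cudaSupport = true; # hypothetical flags, not torch's actual arguments
  rocmSupport = true;
  isLinux = true;
in
# true when at least one listed condition holds, i.e. the package is "broken"
builtins.any id [
  # the two GPU backends are mutually exclusive
  (cudaSupport && rocmSupport)
  # CUDA is only supported on Linux
  (cudaSupport && !isLinux)
]

With these flags the expression evaluates to true, because the first condition holds.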