Merge pull request #217367 from ConnorBaker/feat/cudaFlags-arch-rewrite
cudaFlags: rewrite to capture all architectures and fix NixOS#215436
This commit is contained in:
commit
d24dde7f6c
8 changed files with 272 additions and 60 deletions
|
@ -50,7 +50,7 @@ stdenv.mkDerivation rec {
|
|||
"-DUSE_OLDCMAKECUDA=ON" # see https://github.com/apache/incubator-mxnet/issues/10743
|
||||
"-DCUDA_ARCH_NAME=All"
|
||||
"-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc"
|
||||
"-DMXNET_CUDA_ARCH=${cudaFlags.cudaCapabilitiesSemiColonString}"
|
||||
"-DMXNET_CUDA_ARCH=${builtins.concatStringsSep ";" cudaFlags.cudaRealArches}"
|
||||
] else [ "-DUSE_CUDA=OFF" ])
|
||||
++ lib.optional (!cudnnSupport) "-DUSE_CUDNN=OFF";
|
||||
|
||||
|
|
|
@ -2,7 +2,18 @@
|
|||
, lib
|
||||
, cudatoolkit
|
||||
}:
|
||||
|
||||
# Type aliases
|
||||
# Gpu = {
|
||||
# archName: String, # e.g., "Hopper"
|
||||
# computeCapability: String, # e.g., "9.0"
|
||||
# minCudaVersion: String, # e.g., "11.8"
|
||||
# maxCudaVersion: String, # e.g., "12.0"
|
||||
# }
|
||||
|
||||
let
|
||||
inherit (lib) attrsets lists strings trivial versions;
|
||||
cudaVersion = cudatoolkit.version;
|
||||
|
||||
# Flags are determined based on your CUDA toolkit by default. You may benefit
|
||||
# from improved performance, reduced file size, or greater hardware support by
|
||||
|
@ -13,66 +24,116 @@ let
|
|||
#
|
||||
# Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351
|
||||
|
||||
defaultCudaCapabilities = rec {
|
||||
cuda9 = [
|
||||
"3.0"
|
||||
"3.5"
|
||||
"5.0"
|
||||
"5.2"
|
||||
"6.0"
|
||||
"6.1"
|
||||
"7.0"
|
||||
];
|
||||
# gpus :: List Gpu
|
||||
gpus = builtins.import ./gpus.nix;
|
||||
|
||||
cuda10 = cuda9 ++ [
|
||||
"7.5"
|
||||
];
|
||||
# isSupported :: Gpu -> Bool
|
||||
isSupported = gpu:
|
||||
let
|
||||
inherit (gpu) minCudaVersion maxCudaVersion;
|
||||
lowerBoundSatisfied = strings.versionAtLeast cudaVersion minCudaVersion;
|
||||
upperBoundSatisfied = !(strings.versionOlder maxCudaVersion cudaVersion);
|
||||
in
|
||||
lowerBoundSatisfied && upperBoundSatisfied;
|
||||
|
||||
cuda11 = [
|
||||
"3.5"
|
||||
"5.0"
|
||||
"5.2"
|
||||
"6.0"
|
||||
"6.1"
|
||||
"7.0"
|
||||
"7.5"
|
||||
"8.0"
|
||||
"8.6"
|
||||
];
|
||||
# supportedGpus :: List Gpu
|
||||
# GPUs which are supported by the provided CUDA version.
|
||||
supportedGpus = builtins.filter isSupported gpus;
|
||||
|
||||
};
|
||||
# cudaArchNameToVersions :: AttrSet String (List String)
|
||||
# Maps the name of a GPU architecture to different versions of that architecture.
|
||||
# For example, "Ampere" maps to [ "8.0" "8.6" "8.7" ].
|
||||
cudaArchNameToVersions =
|
||||
lists.groupBy'
|
||||
(versions: gpu: versions ++ [ gpu.computeCapability ])
|
||||
[ ]
|
||||
(gpu: gpu.archName)
|
||||
supportedGpus;
|
||||
|
||||
cudaMicroarchitectureNames = {
|
||||
"3" = "Kepler";
|
||||
"5" = "Maxwell";
|
||||
"6" = "Pascal";
|
||||
"7" = "Volta";
|
||||
"8" = "Ampere";
|
||||
"9" = "Hopper";
|
||||
};
|
||||
# cudaArchNames :: List String
|
||||
# NOTE: It's important that we don't rely on builtins.attrNames cudaArchNameToVersions here;
|
||||
# otherwise, we'll get the names sorted in alphabetical order. The list in gpus.nix we read them
|
||||
# from is already sorted, so we'll preserve that order here.
|
||||
cudaArchNames = lists.unique (lists.map (gpu: gpu.archName) supportedGpus);
|
||||
|
||||
defaultCudaArchList = defaultCudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
|
||||
cudaRealCapabilities = config.cudaCapabilities or defaultCudaArchList;
|
||||
capabilitiesForward = "${lib.last cudaRealCapabilities}+PTX";
|
||||
# cudaComputeCapabilityToName :: AttrSet String String
|
||||
# Maps the version of a GPU architecture to the name of that architecture.
|
||||
# For example, "8.0" maps to "Ampere".
|
||||
cudaComputeCapabilityToName = builtins.listToAttrs (
|
||||
lists.map
|
||||
(gpu: {
|
||||
name = gpu.computeCapability;
|
||||
value = gpu.archName;
|
||||
})
|
||||
supportedGpus
|
||||
);
|
||||
|
||||
dropDot = ver: builtins.replaceStrings ["."] [""] ver;
|
||||
# cudaComputeCapabilities :: List String
|
||||
# NOTE: It's important that we don't rely on builtins.attrNames cudaComputeCapabilityToName here;
|
||||
# otherwise, we'll get the versions sorted in alphabetical order. The list in gpus.nix we read them
|
||||
# from is already sorted, so we'll preserve that order here.
|
||||
# Use the user-provided list of CUDA capabilities if it's provided.
|
||||
cudaComputeCapabilities = config.cudaCapabilities
|
||||
or (lists.map (gpu: gpu.computeCapability) supportedGpus);
|
||||
|
||||
archMapper = feat: map (ver: "${feat}_${dropDot ver}");
|
||||
gencodeMapper = feat: map (ver: "-gencode=arch=compute_${dropDot ver},code=${feat}_${dropDot ver}");
|
||||
cudaRealArchs = archMapper "sm" cudaRealCapabilities;
|
||||
cudaPTXArchs = archMapper "compute" cudaRealCapabilities;
|
||||
cudaArchs = cudaRealArchs ++ [ (lib.last cudaPTXArchs) ];
|
||||
# cudaForwardComputeCapability :: String
|
||||
cudaForwardComputeCapability = (lists.last cudaComputeCapabilities) + "+PTX";
|
||||
|
||||
cudaArchNames = lib.unique (map (v: cudaMicroarchitectureNames.${lib.versions.major v}) cudaRealCapabilities);
|
||||
cudaCapabilities = cudaRealCapabilities ++ lib.optional (config.cudaForwardCompat or true) capabilitiesForward;
|
||||
cudaGencode = gencodeMapper "sm" cudaRealCapabilities ++ lib.optionals (config.cudaForwardCompat or true) (gencodeMapper "compute" [ (lib.last cudaPTXArchs) ]);
|
||||
# cudaComputeCapabilitiesAndForward :: List String
|
||||
# The list of supported CUDA architectures, including the forward compatibility architecture.
|
||||
# If forward compatibility is disabled, this will be the same as cudaComputeCapabilities.
|
||||
cudaComputeCapabilitiesAndForward = cudaComputeCapabilities
|
||||
++ lists.optional (config.cudaForwardCompat or true) cudaForwardComputeCapability;
|
||||
|
||||
cudaCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaCapabilities;
|
||||
cudaCapabilitiesSemiColonString = lib.strings.concatStringsSep ";" cudaCapabilities;
|
||||
cudaRealCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaRealCapabilities;
|
||||
# dropDot :: String -> String
|
||||
dropDot = ver: builtins.replaceStrings [ "." ] [ "" ] ver;
|
||||
|
||||
# archMapper :: String -> List String -> List String
|
||||
# Maps a feature across a list of architecture versions to produce a list of architectures.
|
||||
# For example, "sm" and [ "8.0" "8.6" "8.7" ] produces [ "sm_80" "sm_86" "sm_87" ].
|
||||
archMapper = feat: lists.map (computeCapability: "${feat}_${dropDot computeCapability}");
|
||||
|
||||
# gencodeMapper :: String -> List String -> List String
|
||||
# Maps a feature across a list of architecture versions to produce a list of gencode arguments.
|
||||
# For example, "sm" and [ "8.0" "8.6" "8.7" ] produces [ "-gencode=arch=compute_80,code=sm_80"
|
||||
# "-gencode=arch=compute_86,code=sm_86" "-gencode=arch=compute_87,code=sm_87" ].
|
||||
gencodeMapper = feat: lists.map (
|
||||
computeCapability:
|
||||
"-gencode=arch=compute_${dropDot computeCapability},code=${feat}_${dropDot computeCapability}"
|
||||
);
|
||||
|
||||
# cudaRealArches :: List String
|
||||
# The real architectures are physical architectures supported by the CUDA version.
|
||||
# For example, "sm_80".
|
||||
cudaRealArches = archMapper "sm" cudaComputeCapabilities;
|
||||
|
||||
# cudaVirtualArches :: List String
|
||||
# The virtual architectures are typically used for forward compatibility, when trying to support
|
||||
# an architecture newer than the CUDA version allows.
|
||||
# For example, "compute_80".
|
||||
cudaVirtualArches = archMapper "compute" cudaComputeCapabilities;
|
||||
|
||||
# cudaArches :: List String
|
||||
# By default, build for all supported architectures and forward compatibility via a virtual
|
||||
# architecture for the newest supported architecture.
|
||||
cudaArches = cudaRealArches ++
|
||||
lists.optional (config.cudaForwardCompat or true) (lists.last cudaVirtualArches);
|
||||
|
||||
# cudaGencode :: List String
|
||||
# A list of CUDA gencode arguments to pass to NVCC.
|
||||
cudaGencode =
|
||||
let
|
||||
base = gencodeMapper "sm" cudaComputeCapabilities;
|
||||
forwardCompat = gencodeMapper "compute" [ (lists.last cudaComputeCapabilities) ];
|
||||
in
|
||||
base ++ lists.optionals (config.cudaForwardCompat or true) forwardCompat;
|
||||
|
||||
in
|
||||
{
|
||||
inherit cudaArchs cudaArchNames cudaCapabilities cudaCapabilitiesCommaString cudaCapabilitiesSemiColonString
|
||||
cudaRealCapabilities cudaRealCapabilitiesCommaString cudaGencode cudaRealArchs cudaPTXArchs;
|
||||
inherit
|
||||
cudaArchNames
|
||||
cudaArchNameToVersions cudaComputeCapabilityToName
|
||||
cudaRealArches cudaVirtualArches cudaArches
|
||||
cudaGencode;
|
||||
cudaCapabilities = cudaComputeCapabilitiesAndForward;
|
||||
}
|
||||
|
|
110
pkgs/development/compilers/cudatoolkit/gpus.nix
Normal file
110
pkgs/development/compilers/cudatoolkit/gpus.nix
Normal file
|
@ -0,0 +1,110 @@
|
|||
[
|
||||
{
|
||||
archName = "Kepler";
|
||||
computeCapability = "3.0";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "10.2";
|
||||
}
|
||||
{
|
||||
archName = "Kepler";
|
||||
computeCapability = "3.2";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "10.2";
|
||||
}
|
||||
{
|
||||
archName = "Kepler";
|
||||
computeCapability = "3.5";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "11.8";
|
||||
}
|
||||
{
|
||||
archName = "Kepler";
|
||||
computeCapability = "3.7";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "11.8";
|
||||
}
|
||||
{
|
||||
archName = "Maxwell";
|
||||
computeCapability = "5.0";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Maxwell";
|
||||
computeCapability = "5.2";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Maxwell";
|
||||
computeCapability = "5.3";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Pascal";
|
||||
computeCapability = "6.0";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Pascal";
|
||||
computeCapability = "6.1";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Pascal";
|
||||
computeCapability = "6.2";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Volta";
|
||||
computeCapability = "7.0";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Volta";
|
||||
computeCapability = "7.2";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Turing";
|
||||
computeCapability = "7.5";
|
||||
minCudaVersion = "10.0";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Ampere";
|
||||
computeCapability = "8.0";
|
||||
minCudaVersion = "11.2";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Ampere";
|
||||
computeCapability = "8.6";
|
||||
minCudaVersion = "11.2";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Ampere";
|
||||
computeCapability = "8.7";
|
||||
minCudaVersion = "11.5";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Ada";
|
||||
computeCapability = "8.9";
|
||||
minCudaVersion = "11.8";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
{
|
||||
archName = "Hopper";
|
||||
computeCapability = "9.0";
|
||||
minCudaVersion = "11.8";
|
||||
maxCudaVersion = "12.0";
|
||||
}
|
||||
]
|
|
@ -52,7 +52,7 @@ in stdenv.mkDerivation (finalAttrs: {
|
|||
"-DCMAKE_C_COMPILER=${cudatoolkit.cc}/bin/gcc"
|
||||
"-DCMAKE_CXX_COMPILER=${cudatoolkit.cc}/bin/g++"
|
||||
"-DMAGMA_ENABLE_CUDA=ON"
|
||||
"-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArchs}"
|
||||
"-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArches}"
|
||||
] ++ lib.optionals useROCM [
|
||||
"-DCMAKE_C_COMPILER=${hip}/bin/hipcc"
|
||||
"-DCMAKE_CXX_COMPILER=${hip}/bin/hipcc"
|
||||
|
|
|
@ -164,7 +164,7 @@ let
|
|||
build --action_env TF_CUDA_PATHS="${cudatoolkit_joined},${cudnn},${nccl}"
|
||||
build --action_env TF_CUDA_VERSION="${lib.versions.majorMinor cudatoolkit.version}"
|
||||
build --action_env TF_CUDNN_VERSION="${lib.versions.major cudnn.version}"
|
||||
build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${cudaFlags.cudaRealCapabilitiesCommaString}"
|
||||
build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${builtins.concatStringsSep "," cudaFlags.cudaRealArches}"
|
||||
'' + ''
|
||||
CFG
|
||||
'';
|
||||
|
|
|
@ -301,7 +301,7 @@ let
|
|||
TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}";
|
||||
GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin";
|
||||
GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc";
|
||||
TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArchs;
|
||||
TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArches;
|
||||
|
||||
postPatch = ''
|
||||
# bazel 3.3 should work just as well as bazel 3.1
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
}:
|
||||
|
||||
let
|
||||
inherit (lib) lists strings trivial;
|
||||
inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
|
||||
in
|
||||
|
||||
|
@ -54,6 +55,45 @@ assert !cudaSupport || magma.cudatoolkit == cudatoolkit;
|
|||
|
||||
let
|
||||
setBool = v: if v then "1" else "0";
|
||||
|
||||
# https://github.com/pytorch/pytorch/blob/v1.13.1/torch/utils/cpp_extension.py#L1751
|
||||
supportedTorchCudaCapabilities =
|
||||
let
|
||||
real = ["3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5" "8.0" "8.6"];
|
||||
ptx = lists.map (x: "${x}+PTX") real;
|
||||
in
|
||||
real ++ ptx;
|
||||
|
||||
# NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
|
||||
# of the first list *from* the second list. That means:
|
||||
# lists.subtractLists a b = b - a
|
||||
|
||||
# For CUDA
|
||||
supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
|
||||
unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;
|
||||
|
||||
# Use trivial.throwIf to abort evaluation if none of the requested GPU targets are supported.
|
||||
gpuArchWarner = supported: unsupported:
|
||||
trivial.throwIf (supported == [ ])
|
||||
(
|
||||
"No supported GPU targets specified. Requested GPU targets: "
|
||||
+ strings.concatStringsSep ", " unsupported
|
||||
)
|
||||
supported;
|
||||
|
||||
# Create the gpuTargetString.
|
||||
gpuTargetString = strings.concatStringsSep ";" (
|
||||
if gpuTargets != [ ] then
|
||||
# If gpuTargets is specified, it always takes priority.
|
||||
gpuTargets
|
||||
else if cudaSupport then
|
||||
gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
|
||||
else if rocmSupport then
|
||||
hip.gpuTargets
|
||||
else
|
||||
throw "No GPU targets specified"
|
||||
);
|
||||
|
||||
cudatoolkit_joined = symlinkJoin {
|
||||
name = "${cudatoolkit.name}-unsplit";
|
||||
# nccl is here purely for semantic grouping; it could be moved to nativeBuildInputs
|
||||
|
@ -146,14 +186,14 @@ in buildPythonPackage rec {
|
|||
'';
|
||||
|
||||
preConfigure = lib.optionalString cudaSupport ''
|
||||
export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
|
||||
export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
|
||||
export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
|
||||
'' + lib.optionalString (cudaSupport && cudnn != null) ''
|
||||
export CUDNN_INCLUDE_DIR=${cudnn}/include
|
||||
'' + lib.optionalString rocmSupport ''
|
||||
export ROCM_PATH=${rocmtoolkit_joined}
|
||||
export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
|
||||
export PYTORCH_ROCM_ARCH="${lib.strings.concatStringsSep ";" (if gpuTargets == [ ] then hip.gpuTargets else gpuTargets)}"
|
||||
export PYTORCH_ROCM_ARCH="${gpuTargetString}"
|
||||
export CMAKE_CXX_FLAGS="-I${rocmtoolkit_joined}/include -I${rocmtoolkit_joined}/include/rocblas"
|
||||
python tools/amd_build/build_amd.py
|
||||
'';
|
||||
|
@ -320,7 +360,8 @@ in buildPythonPackage rec {
|
|||
requiredSystemFeatures = [ "big-parallel" ];
|
||||
|
||||
passthru = {
|
||||
inherit cudaSupport cudaPackages;
|
||||
inherit cudaSupport cudaPackages gpuTargetString;
|
||||
cudaCapabilities = supportedCudaCapabilities;
|
||||
# At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability.
|
||||
blasProvider = blas.provider;
|
||||
};
|
||||
|
|
|
@ -15,13 +15,13 @@
|
|||
}:
|
||||
|
||||
let
|
||||
inherit (torch.cudaPackages) cudatoolkit cudaFlags cudnn;
|
||||
inherit (torch) gpuTargetString;
|
||||
inherit (torch.cudaPackages) cudatoolkit cudnn;
|
||||
|
||||
cudatoolkit_joined = symlinkJoin {
|
||||
name = "${cudatoolkit.name}-unsplit";
|
||||
paths = [ cudatoolkit.out cudatoolkit.lib ];
|
||||
};
|
||||
cudaArchStr = lib.optionalString cudaSupport lib.strings.concatStringsSep ";" torch.cudaArchList;
|
||||
in buildPythonPackage rec {
|
||||
pname = "torchvision";
|
||||
version = "0.14.1";
|
||||
|
@ -45,7 +45,7 @@ in buildPythonPackage rec {
|
|||
propagatedBuildInputs = [ numpy pillow torch scipy ];
|
||||
|
||||
preBuild = lib.optionalString cudaSupport ''
|
||||
export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
|
||||
export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
|
||||
export FORCE_CUDA=1
|
||||
'';
|
||||
|
||||
|
|
Loading…
Reference in a new issue