Merge pull request #217367 from ConnorBaker/feat/cudaFlags-arch-rewrite

cudaFlags: rewrite to capture all architectures and fix NixOS#215436
2023-02-24 17:31:11 -05:00 · 2023-02-24 17:31:11 -05:00 · d24dde7f6c
commit d24dde7f6c
parent 2a5c35ec61 dddc103fdb
8 changed files with 272 additions and 60 deletions
--- a/pkgs/applications/science/math/mxnet/default.nix
+++ b/pkgs/applications/science/math/mxnet/default.nix
@ -50,7 +50,7 @@ stdenv.mkDerivation rec {
      "-DUSE_OLDCMAKECUDA=ON"  # see https://github.com/apache/incubator-mxnet/issues/10743
      "-DCUDA_ARCH_NAME=All"
      "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc"
-      "-DMXNET_CUDA_ARCH=${cudaFlags.cudaCapabilitiesSemiColonString}"
+      "-DMXNET_CUDA_ARCH=${builtins.concatStringsSep ";" cudaFlags.cudaRealArches}"
    ] else [ "-DUSE_CUDA=OFF" ])
    ++ lib.optional (!cudnnSupport) "-DUSE_CUDNN=OFF";

--- a/pkgs/development/compilers/cudatoolkit/flags.nix
+++ b/pkgs/development/compilers/cudatoolkit/flags.nix
@ -2,7 +2,18 @@
 , lib
 , cudatoolkit
 }:
+
+# Type aliases
+# Gpu = {
+#   archName: String, # e.g., "Hopper"
+#   computeCapability: String, # e.g., "9.0"
+#   minCudaVersion: String, # e.g., "11.8"
+#   maxCudaVersion: String, # e.g., "12.0"
+# }
+
 let
+  inherit (lib) attrsets lists strings trivial versions;
+  cudaVersion = cudatoolkit.version;

  # Flags are determined based on your CUDA toolkit by default.  You may benefit
  # from improved performance, reduced file size, or greater hardware support by
@ -13,66 +24,116 @@ let
  #
  # Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351

-  defaultCudaCapabilities = rec {
-    cuda9 = [
-      "3.0"
-      "3.5"
-      "5.0"
-      "5.2"
-      "6.0"
-      "6.1"
-      "7.0"
-    ];
+  # gpus :: List Gpu
+  gpus = builtins.import ./gpus.nix;

-    cuda10 = cuda9 ++ [
-      "7.5"
-    ];
+  # isSupported :: Gpu -> Bool
+  # True when the toolkit's major.minor version lies within the GPU's inclusive
+  # [minCudaVersion, maxCudaVersion] range.
+  isSupported = gpu:
+    let
+      inherit (gpu) minCudaVersion maxCudaVersion;
+      # The bounds in gpus.nix are major.minor only. A missing version component
+      # compares as older than any number ("11.8" < "11.8.0"), so comparing a
+      # full toolkit version such as "11.8.0" against them would wrongly fail
+      # the upper bound. Normalise to major.minor first.
+      cudaMajorMinor = versions.majorMinor cudaVersion;
+      lowerBoundSatisfied = strings.versionAtLeast cudaMajorMinor minCudaVersion;
+      # Written as "not (max < version)", i.e. version <= max.
+      upperBoundSatisfied = !(strings.versionOlder maxCudaVersion cudaMajorMinor);
+    in
+    lowerBoundSatisfied && upperBoundSatisfied;

-    cuda11 = [
-      "3.5"
-      "5.0"
-      "5.2"
-      "6.0"
-      "6.1"
-      "7.0"
-      "7.5"
-      "8.0"
-      "8.6"
-    ];
+  # supportedGpus :: List Gpu
+  # GPUs which are supported by the provided CUDA version.
+  supportedGpus = builtins.filter isSupported gpus;

-  };
+  # cudaArchNameToVersions :: AttrSet String (List String)
+  # Groups the compute capabilities of supportedGpus by architecture name.
+  # For example, "Ampere" maps to [ "8.0" "8.6" "8.7" ].
+  cudaArchNameToVersions =
+    let
+      # Grouping key: the architecture name of a GPU.
+      keyOf = gpu: gpu.archName;
+      # Fold step: append this GPU's compute capability to its group.
+      collect = acc: gpu: acc ++ [ gpu.computeCapability ];
+    in
+    lists.groupBy' collect [ ] keyOf supportedGpus;

-  cudaMicroarchitectureNames = {
-    "3" = "Kepler";
-    "5" = "Maxwell";
-    "6" = "Pascal";
-    "7" = "Volta";
-    "8" = "Ampere";
-    "9" = "Hopper";
-  };
+  # cudaArchNames :: List String
+  # Names of the architectures actually being targeted, i.e. those of
+  # cudaComputeCapabilities (which honours config.cudaCapabilities), de-duplicated.
+  # NOTE: Mapping over the capability list rather than using builtins.attrNames keeps
+  #   the order of that list instead of sorting the names alphabetically.
+  cudaArchNames =
+    let
+      # Built from the unfiltered gpus list so that a capability can be named even
+      # when it is outside the range supported by this CUDA version.
+      archNameOf = builtins.listToAttrs (
+        lists.map (gpu: { name = gpu.computeCapability; value = gpu.archName; }) gpus
+      );
+      # Fail with a clear message instead of a bare missing-attribute error.
+      toArchName = capability:
+        archNameOf.${capability}
+          or (throw "cudaFlags: unknown CUDA compute capability '${capability}'");
+    in
+    lists.unique (lists.map toArchName cudaComputeCapabilities);

-  defaultCudaArchList = defaultCudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
-  cudaRealCapabilities = config.cudaCapabilities or defaultCudaArchList;
-  capabilitiesForward = "${lib.last cudaRealCapabilities}+PTX";
+  # cudaComputeCapabilityToName :: AttrSet String String
+  # Maps the version of a GPU architecture to the name of that architecture.
+  # For example, "8.0" maps to "Ampere".
+  cudaComputeCapabilityToName = builtins.listToAttrs (
+    lists.map
+      (gpu: {
+        name = gpu.computeCapability;
+        value = gpu.archName;
+      })
+      supportedGpus
+  );

-  dropDot = ver: builtins.replaceStrings ["."] [""] ver;
+  # cudaComputeCapabilities :: List String
+  # Use the user-provided list of CUDA capabilities (config.cudaCapabilities) if it's provided;
+  # otherwise fall back to the compute capability of every entry in supportedGpus.
+  # NOTE: It's important that we don't rely on builtins.attrNames cudaComputeCapabilityToName here;
+  #   otherwise, we'll get the versions sorted in alphabetical order. The list in gpus.nix
+  #   is already sorted, so we'll preserve that order here.
+  cudaComputeCapabilities = config.cudaCapabilities
+    or (lists.map (gpu: gpu.computeCapability) supportedGpus);

-  archMapper = feat: map (ver: "${feat}_${dropDot ver}");
-  gencodeMapper = feat: map (ver: "-gencode=arch=compute_${dropDot ver},code=${feat}_${dropDot ver}");
-  cudaRealArchs = archMapper "sm" cudaRealCapabilities;
-  cudaPTXArchs = archMapper "compute" cudaRealCapabilities;
-  cudaArchs = cudaRealArchs ++ [ (lib.last cudaPTXArchs) ];
+  # cudaForwardComputeCapability :: String
+  cudaForwardComputeCapability = (lists.last cudaComputeCapabilities) + "+PTX";

-  cudaArchNames = lib.unique (map (v: cudaMicroarchitectureNames.${lib.versions.major v}) cudaRealCapabilities);
-  cudaCapabilities = cudaRealCapabilities ++ lib.optional (config.cudaForwardCompat or true) capabilitiesForward;
-  cudaGencode = gencodeMapper "sm" cudaRealCapabilities ++ lib.optionals (config.cudaForwardCompat or true) (gencodeMapper "compute" [ (lib.last cudaPTXArchs) ]);
+  # cudaComputeCapabilitiesAndForward :: List String
+  # The list of supported CUDA architectures, including the forward compatibility architecture.
+  # If forward compatibility is disabled, this will be the same as cudaComputeCapabilities.
+  cudaComputeCapabilitiesAndForward = cudaComputeCapabilities
+    ++ lists.optional (config.cudaForwardCompat or true) cudaForwardComputeCapability;

-  cudaCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaCapabilities;
-  cudaCapabilitiesSemiColonString = lib.strings.concatStringsSep ";" cudaCapabilities;
-  cudaRealCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaRealCapabilities;
+  # dropDot :: String -> String
+  dropDot = ver: builtins.replaceStrings [ "." ] [ "" ] ver;
+
+  # archMapper :: String -> List String -> List String
+  # Maps a feature across a list of architecture versions to produce a list of architectures.
+  # For example, "sm" and [ "8.0" "8.6" "8.7" ] produces [ "sm_80" "sm_86" "sm_87" ].
+  archMapper = feat: lists.map (computeCapability: "${feat}_${dropDot computeCapability}");
+
+  # gencodeMapper :: String -> List String -> List String
+  # Maps a feature across a list of architecture versions to produce a list of gencode arguments.
+  # For example, "sm" and [ "8.0" "8.6" "8.7" ] produces [ "-gencode=arch=compute_80,code=sm_80"
+  # "-gencode=arch=compute_86,code=sm_86" "-gencode=arch=compute_87,code=sm_87" ].
+  gencodeMapper = feat: lists.map (
+    computeCapability:
+    "-gencode=arch=compute_${dropDot computeCapability},code=${feat}_${dropDot computeCapability}"
+  );
+
+  # cudaRealArches :: List String
+  # The real architectures are physical architectures supported by the CUDA version.
+  # For example, "sm_80".
+  cudaRealArches = archMapper "sm" cudaComputeCapabilities;
+
+  # cudaVirtualArches :: List String
+  # The virtual architectures are typically used for forward compatibility, when trying to support
+  # an architecture newer than the CUDA version allows.
+  # For example, "compute_80".
+  cudaVirtualArches = archMapper "compute" cudaComputeCapabilities;
+
+  # cudaArches :: List String
+  # By default, build for all supported architectures and forward compatibility via a virtual
+  # architecture for the newest supported architecture.
+  cudaArches = cudaRealArches ++
+    lists.optional (config.cudaForwardCompat or true) (lists.last cudaVirtualArches);
+
+  # cudaGencode :: List String
+  # A list of CUDA gencode arguments to pass to NVCC.
+  cudaGencode =
+    let
+      base = gencodeMapper "sm" cudaComputeCapabilities;
+      forwardCompat = gencodeMapper "compute" [ (lists.last cudaComputeCapabilities) ];
+    in
+    base ++ lists.optionals (config.cudaForwardCompat or true) forwardCompat;

 in
 {
-   inherit cudaArchs cudaArchNames cudaCapabilities cudaCapabilitiesCommaString cudaCapabilitiesSemiColonString
-     cudaRealCapabilities cudaRealCapabilitiesCommaString cudaGencode cudaRealArchs cudaPTXArchs;
+  inherit
+    cudaArchNames
+    cudaArchNameToVersions cudaComputeCapabilityToName
+    cudaRealArches cudaVirtualArches cudaArches
+    cudaGencode;
+  cudaCapabilities = cudaComputeCapabilitiesAndForward;
 }
--- a/pkgs/development/compilers/cudatoolkit/gpus.nix
+++ b/pkgs/development/compilers/cudatoolkit/gpus.nix
@ -0,0 +1,110 @@
+[
+  {
+    archName = "Kepler";
+    computeCapability = "3.0";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "10.2";
+  }
+  {
+    archName = "Kepler";
+    computeCapability = "3.2";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "10.2";
+  }
+  {
+    archName = "Kepler";
+    computeCapability = "3.5";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "11.8";
+  }
+  {
+    archName = "Kepler";
+    computeCapability = "3.7";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "11.8";
+  }
+  {
+    archName = "Maxwell";
+    computeCapability = "5.0";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Maxwell";
+    computeCapability = "5.2";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Maxwell";
+    computeCapability = "5.3";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Pascal";
+    computeCapability = "6.0";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Pascal";
+    computeCapability = "6.1";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Pascal";
+    computeCapability = "6.2";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Volta";
+    computeCapability = "7.0";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Volta";
+    computeCapability = "7.2";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Turing";
+    computeCapability = "7.5";
+    minCudaVersion = "10.0";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Ampere";
+    computeCapability = "8.0";
+    minCudaVersion = "11.2";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Ampere";
+    computeCapability = "8.6";
+    minCudaVersion = "11.2";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Ampere";
+    computeCapability = "8.7";
+    minCudaVersion = "11.5";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Ada";
+    computeCapability = "8.9";
+    minCudaVersion = "11.8";
+    maxCudaVersion = "12.0";
+  }
+  {
+    archName = "Hopper";
+    computeCapability = "9.0";
+    minCudaVersion = "11.8";
+    maxCudaVersion = "12.0";
+  }
+]
--- a/pkgs/development/libraries/science/math/magma/default.nix
+++ b/pkgs/development/libraries/science/math/magma/default.nix
@ -52,7 +52,7 @@ in stdenv.mkDerivation (finalAttrs: {
    "-DCMAKE_C_COMPILER=${cudatoolkit.cc}/bin/gcc"
    "-DCMAKE_CXX_COMPILER=${cudatoolkit.cc}/bin/g++"
    "-DMAGMA_ENABLE_CUDA=ON"
-    "-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArchs}"
+    "-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArches}"
  ] ++ lib.optionals useROCM [
    "-DCMAKE_C_COMPILER=${hip}/bin/hipcc"
    "-DCMAKE_CXX_COMPILER=${hip}/bin/hipcc"
--- a/pkgs/development/python-modules/jaxlib/default.nix
+++ b/pkgs/development/python-modules/jaxlib/default.nix
@ -164,7 +164,7 @@ let
      build --action_env TF_CUDA_PATHS="${cudatoolkit_joined},${cudnn},${nccl}"
      build --action_env TF_CUDA_VERSION="${lib.versions.majorMinor cudatoolkit.version}"
      build --action_env TF_CUDNN_VERSION="${lib.versions.major cudnn.version}"
-      build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${cudaFlags.cudaRealCapabilitiesCommaString}"
+      build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${builtins.concatStringsSep "," cudaFlags.cudaRealArches}"
    '' + ''
      CFG
    '';
--- a/pkgs/development/python-modules/tensorflow/default.nix
+++ b/pkgs/development/python-modules/tensorflow/default.nix
@ -301,7 +301,7 @@ let
    TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}";
    GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin";
    GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc";
-    TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArchs;
+    TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArches;

    postPatch = ''
      # bazel 3.3 should work just as well as bazel 3.1
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@ -41,6 +41,7 @@
 }:

 let
+  inherit (lib) lists strings trivial;
  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
 in

@ -54,6 +55,45 @@ assert !cudaSupport || magma.cudatoolkit == cudatoolkit;

 let
  setBool = v: if v then "1" else "0";
+
+  # https://github.com/pytorch/pytorch/blob/v1.13.1/torch/utils/cpp_extension.py#L1751
+  supportedTorchCudaCapabilities =
+    let
+      real = ["3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5" "8.0" "8.6"];
+      ptx = lists.map (x: "${x}+PTX") real;
+    in
+    real ++ ptx;
+
+  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
+  #   of the first list *from* the second list. That means:
+  #   lists.subtractLists a b = b - a
+
+  # For CUDA
+  supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
+  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;
+
+  # Returns `supported` unchanged, or aborts evaluation via trivial.throwIf when it is
+  # empty, listing `unsupported` in the error message.
+  # NOTE(review): despite the name, no warning is emitted when only some of the
+  #   requested targets are unsupported; they are dropped silently.
+  gpuArchWarner = supported: unsupported:
+    trivial.throwIf (supported == [ ])
+      (
+        "No supported GPU targets specified. Requested GPU targets: "
+        + strings.concatStringsSep ", " unsupported
+      )
+      supported;
+
+  # Create the gpuTargetString: the target list joined with ";".
+  # The list is chosen in priority order: gpuTargets if non-empty, then the CUDA
+  # capabilities (when cudaSupport), then hip.gpuTargets (when rocmSupport).
+  # Evaluating this throws when none of those apply.
+  gpuTargetString = strings.concatStringsSep ";" (
+    if gpuTargets != [ ] then
+    # If gpuTargets is specified, it always takes priority.
+      gpuTargets
+    else if cudaSupport then
+      # Throws if none of the requested CUDA capabilities are supported.
+      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
+    else if rocmSupport then
+      hip.gpuTargets
+    else
+      throw "No GPU targets specified"
+  );
+
  cudatoolkit_joined = symlinkJoin {
    name = "${cudatoolkit.name}-unsplit";
    # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs
@ -146,14 +186,14 @@ in buildPythonPackage rec {
  '';

  preConfigure = lib.optionalString cudaSupport ''
-    export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
+    export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
  '' + lib.optionalString (cudaSupport && cudnn != null) ''
    export CUDNN_INCLUDE_DIR=${cudnn}/include
  '' + lib.optionalString rocmSupport ''
    export ROCM_PATH=${rocmtoolkit_joined}
    export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
-    export PYTORCH_ROCM_ARCH="${lib.strings.concatStringsSep ";" (if gpuTargets == [ ] then hip.gpuTargets else gpuTargets)}"
+    export PYTORCH_ROCM_ARCH="${gpuTargetString}"
    export CMAKE_CXX_FLAGS="-I${rocmtoolkit_joined}/include -I${rocmtoolkit_joined}/include/rocblas"
    python tools/amd_build/build_amd.py
  '';
@ -320,7 +360,8 @@ in buildPythonPackage rec {
  requiredSystemFeatures = [ "big-parallel" ];

  passthru = {
-    inherit cudaSupport cudaPackages;
+    inherit cudaSupport cudaPackages gpuTargetString;
+    cudaCapabilities = supportedCudaCapabilities;
    # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability.
    blasProvider = blas.provider;
  };
--- a/pkgs/development/python-modules/torchvision/default.nix
+++ b/pkgs/development/python-modules/torchvision/default.nix
@ -15,13 +15,13 @@
 }:

 let
-  inherit (torch.cudaPackages) cudatoolkit cudaFlags cudnn;
+  inherit (torch) gpuTargetString;
+  inherit (torch.cudaPackages) cudatoolkit cudnn;

  cudatoolkit_joined = symlinkJoin {
    name = "${cudatoolkit.name}-unsplit";
    paths = [ cudatoolkit.out cudatoolkit.lib ];
  };
-  cudaArchStr = lib.optionalString cudaSupport lib.strings.concatStringsSep ";" torch.cudaArchList;
 in buildPythonPackage rec {
  pname = "torchvision";
  version = "0.14.1";
@ -45,7 +45,7 @@ in buildPythonPackage rec {
  propagatedBuildInputs = [ numpy pillow torch scipy ];

  preBuild = lib.optionalString cudaSupport ''
-    export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
+    export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
    export FORCE_CUDA=1
  '';