From 501a1af970ca54cb300474a00aacfbd01f8a5b24 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Thu, 14 Dec 2023 20:12:20 +0000
Subject: [PATCH 1/6] cudaPackages.saxpy: now available pre-11.4 with CUDA
 Toolkit

---
 pkgs/top-level/cuda-packages.nix | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix
index 9045b5754ab8..f997963ff468 100644
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@@ -72,10 +72,7 @@ let
 
         # Loose packages
         cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {};
-        # SaxPy is only available after 11.4 because it requires redistributable versions of CUDA libraries.
-        saxpy = attrsets.optionalAttrs (strings.versionAtLeast cudaVersion "11.4") (
-          final.callPackage ../development/cuda-modules/saxpy {}
-        );
+        saxpy = final.callPackage ../development/cuda-modules/saxpy {};
       }
       # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
       # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9

From 9bebd9e72d6b552fcfd3d1e6716eca6563944f42 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Thu, 14 Dec 2023 22:19:02 +0000
Subject: [PATCH 2/6] tree-wide: cudaPackages should not break default eval

cudaPackages: guard expressions against null values
---
 .../science/math/caffe/default.nix            |  2 +-
 pkgs/development/cuda-modules/cudnn/shims.nix | 18 ++++--
 .../cuda-modules/cutensor/extension.nix       |  1 +
 pkgs/development/cuda-modules/flags.nix       | 50 ++++++---------
 .../generic-builders/manifest.nix             | 62 ++++++++++++-------
 .../generic-builders/multiplex.nix            | 18 ++----
 .../development/cuda-modules/nccl/default.nix |  3 +
 .../cuda-modules/tensorrt/fixup.nix           | 15 +++--
 .../cuda-modules/tensorrt/shims.nix           | 24 ++++---
 .../libraries/science/math/magma/generic.nix  |  2 +-
 .../development/libraries/xgboost/default.nix |  2 +-
 .../python-modules/jaxlib/default.nix         |  3 +-
 .../python-modules/torch/default.nix          | 13 ++--
 pkgs/top-level/cuda-packages.nix              |  4 --
 14 files changed, 119 insertions(+), 98 deletions(-)

diff --git a/pkgs/applications/science/math/caffe/default.nix b/pkgs/applications/science/math/caffe/default.nix
index 6595f0b846dd..25f7229a845a 100644
--- a/pkgs/applications/science/math/caffe/default.nix
+++ b/pkgs/applications/science/math/caffe/default.nix
@@ -153,7 +153,7 @@ stdenv.mkDerivation rec {
       || cudaSupport
       || !(leveldbSupport -> (leveldb != null && snappy != null))
       || !(cudnnSupport -> (hasCudnn && cudaSupport))
-      || !(ncclSupport -> cudaSupport)
+      || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported))
       || !(pythonSupport -> (python != null && numpy != null))
     ;
     license = licenses.bsd2;
diff --git a/pkgs/development/cuda-modules/cudnn/shims.nix b/pkgs/development/cuda-modules/cudnn/shims.nix
index e9eca8ef7c8b..a36ee26dab5d 100644
--- a/pkgs/development/cuda-modules/cudnn/shims.nix
+++ b/pkgs/development/cuda-modules/cudnn/shims.nix
@@ -1,10 +1,18 @@
 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
-{package, redistArch}:
 {
-  featureRelease.${redistArch}.outputs = {
-    lib = true;
-    static = true;
-    dev = true;
+  lib,
+  package,
+  # redistArch :: String
+  # String is "unsupported" if the given architecture is unsupported.
+  redistArch,
+}:
+{
+  featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
+    ${redistArch}.outputs = {
+      lib = true;
+      static = true;
+      dev = true;
+    };
   };
   redistribRelease = {
     name = "NVIDIA CUDA Deep Neural Network library (cuDNN)";
diff --git a/pkgs/development/cuda-modules/cutensor/extension.nix b/pkgs/development/cuda-modules/cutensor/extension.nix
index b762fd22ede8..534941887c6e 100644
--- a/pkgs/development/cuda-modules/cutensor/extension.nix
+++ b/pkgs/development/cuda-modules/cutensor/extension.nix
@@ -92,6 +92,7 @@ let
   # A release is supported if it has a libPath that matches our CUDA version for our platform.
   # LibPath are not constant across the same release -- one platform may support fewer
   # CUDA versions than another.
+  # redistArch :: String
   redistArch = flags.getRedistArch hostPlatform.system;
   # platformIsSupported :: Manifests -> Boolean
   platformIsSupported =
diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix
index a123c7bce5a1..d5e01be01fd5 100644
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@@ -131,39 +131,29 @@ let
   # `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices.
   # Since both are based on aarch64, we can only have one or the other, otherwise there's an
   # ambiguity as to which should be used.
+  # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
+  # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
+  # systems gracefully.
   # getRedistArch :: String -> String
-  getRedistArch =
-    nixSystem:
-    if nixSystem == "aarch64-linux" then
-      if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"
-    else if nixSystem == "x86_64-linux" then
-      "linux-x86_64"
-    else if nixSystem == "ppc64le-linux" then
-      "linux-ppc64le"
-    else if nixSystem == "x86_64-windows" then
-      "windows-x86_64"
-    else
-      "unsupported";
+  getRedistArch = nixSystem: attrsets.attrByPath [ nixSystem ] "unsupported" {
+    aarch64-linux = if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa";
+    x86_64-linux = "linux-x86_64";
+    ppc64le-linux = "linux-ppc64le";
+    x86_64-windows = "windows-x86_64";
+  };
 
   # Maps NVIDIA redist arch to Nix system.
-  # It is imperative that we include the boolean condition based on jetsonTargets to ensure
-  # we don't advertise availability of packages only available on server-grade ARM
-  # as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are
-  # mapped to the Nix system `aarch64-linux`.
-  getNixSystem =
-    redistArch:
-    if redistArch == "linux-sbsa" && jetsonTargets == [] then
-      "aarch64-linux"
-    else if redistArch == "linux-aarch64" && jetsonTargets != [] then
-      "aarch64-linux"
-    else if redistArch == "linux-x86_64" then
-      "x86_64-linux"
-    else if redistArch == "linux-ppc64le" then
-      "ppc64le-linux"
-    else if redistArch == "windows-x86_64" then
-      "x86_64-windows"
-    else
-      "unsupported-${redistArch}";
+  # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
+  # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
+  # systems gracefully.
+  # getNixSystem :: String -> String
+  getNixSystem = redistArch: attrsets.attrByPath [ redistArch ] "unsupported-${redistArch}" {
+    linux-sbsa = "aarch64-linux";
+    linux-aarch64 = "aarch64-linux";
+    linux-x86_64 = "x86_64-linux";
+    linux-ppc64le = "ppc64le-linux";
+    windows-x86_64 = "x86_64-windows";
+  };
 
   formatCapabilities =
     {
diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 5a4c5280d7db..64204346791a 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -42,6 +42,9 @@ let
   # Get the redist architectures for which package provides distributables.
   # These are used by meta.platforms.
   supportedRedistArchs = builtins.attrNames featureRelease;
+  # redistArch :: String
+  # The redistArch is the name of the architecture for which the redistributable is built.
+  # It is `"unsupported"` if the redistributable is not supported on the target platform.
   redistArch = flags.getRedistArch hostPlatform.system;
 in
 backendStdenv.mkDerivation (
@@ -86,8 +89,18 @@ backendStdenv.mkDerivation (
           "sample"
           "python"
         ];
+        # Filter out outputs that don't exist in the redistributable.
+        # NOTE: In the case the redistributable isn't supported on the target platform,
+        # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which
+        # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`.
+        # The alternative would be to have `outputs = [ "out" ]` when`redistArch = "unsupported"`, but that would
+        # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true --
+        # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with
+        # `cudaSupport = false`!
         additionalOutputs =
-          if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs;
+          if redistArch == "unsupported"
+          then possibleOutputs
+          else builtins.filter hasOutput possibleOutputs;
         # The out output is special -- it's the default output and we always include it.
         outputs = [ "out" ] ++ additionalOutputs;
       in
@@ -114,19 +127,28 @@ backendStdenv.mkDerivation (
     # Useful for introspecting why something went wrong.
     # Maps descriptions of why the derivation would be marked broken to
     # booleans indicating whether that description is true.
-    brokenConditions = {};
-
-    src = fetchurl {
-      url =
-        if (builtins.hasAttr redistArch redistribRelease) then
-          "https://developer.download.nvidia.com/compute/${redistName}/redist/${
-            redistribRelease.${redistArch}.relative_path
-          }"
-        else
-          "cannot-construct-an-url-for-the-${redistArch}-platform";
-      sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash;
+    # brokenConditions :: AttrSet Bool
+    brokenConditions = {
+      # Using an unrecognized redistArch
+      "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported";
+      # Trying to build for a platform that doesn't have a redistributable
+      "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null;
     };
 
+    # src :: Optional Derivation
+    src = trivial.pipe redistArch [
+      # If redistArch doesn't exist in redistribRelease, return null.
+      (redistArch: redistribRelease.${redistArch} or null)
+      # If the release is non-null, fetch the source; otherwise, return null.
+      (trivial.mapNullable (
+        { relative_path, sha256, ... }:
+        fetchurl {
+          url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
+          inherit sha256;
+        }
+      ))
+    ];
+
     postPatch = ''
       if [[ -d pkg-config ]] ; then
         mkdir -p share/pkg-config
@@ -284,16 +306,12 @@ backendStdenv.mkDerivation (
     meta = {
       description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}";
       sourceProvenance = [sourceTypes.binaryNativeCode];
-      platforms =
-        lists.concatMap
-          (
-            redistArch:
-            let
-              nixSystem = flags.getNixSystem redistArch;
-            in
-            lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ]
-          )
-          supportedRedistArchs;
+      platforms = trivial.pipe supportedRedistArchs [
+        # Map each redist arch to the equivalent nix system or null if there is no equivalent.
+        (builtins.map flags.getNixSystem)
+        # Filter out unsupported systems
+        (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem)))
+      ];
       broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions);
       license = licenses.unfree;
       maintainers = teams.cuda.members;
diff --git a/pkgs/development/cuda-modules/generic-builders/multiplex.nix b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
index 5480da730726..6353b07545a4 100644
--- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix
+++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
@@ -20,7 +20,7 @@
   # The featureRelease is used to populate meta.platforms (by way of looking at the attribute names)
   # and to determine the outputs of the package.
   # shimFn :: {package, redistArch} -> AttrSet
-  shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"),
+  shimsFn ? (throw "shimsFn must be provided"),
   # fixupFn :: Path
   # A path (or nix expression) to be evaluated with callPackage and then
   # provided to the package's overrideAttrs function.
@@ -29,16 +29,8 @@
   # - cudaVersion
   # - mkVersionedPackageName
   # - package
-  fixupFn ? (
-    {
-      final,
-      cudaVersion,
-      mkVersionedPackageName,
-      package,
-      ...
-    }:
-    throw "fixupFn must be provided"
-  ),
+  # - ...
+  fixupFn ? (throw "fixupFn must be provided"),
 }:
 let
   inherit (lib)
@@ -80,9 +72,11 @@ let
     && strings.versionAtLeast package.maxCudaVersion cudaVersion;
 
   # Get all of the packages for our given platform.
+  # redistArch :: String
+  # Value is `"unsupported"` if the platform is not supported.
   redistArch = flags.getRedistArch hostPlatform.system;
 
-  allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets);
+  allReleases = lists.flatten (builtins.attrValues releaseSets);
 
   # All the supported packages we can build for our platform.
   # perSystemReleases :: List Package
diff --git a/pkgs/development/cuda-modules/nccl/default.nix b/pkgs/development/cuda-modules/nccl/default.nix
index c56d59cb4206..6e385688d0f8 100644
--- a/pkgs/development/cuda-modules/nccl/default.nix
+++ b/pkgs/development/cuda-modules/nccl/default.nix
@@ -100,6 +100,9 @@ backendStdenv.mkDerivation (
       homepage = "https://developer.nvidia.com/nccl";
       license = licenses.bsd3;
       platforms = platforms.linux;
+      # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
+      # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
+      badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ];
       maintainers =
         with maintainers;
         [
diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix
index 43a7dfb81784..51ca3d652bd1 100644
--- a/pkgs/development/cuda-modules/tensorrt/fixup.nix
+++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix
@@ -11,18 +11,17 @@
 }:
 let
   inherit (lib)
+    attrsets
     maintainers
     meta
     strings
     versions
     ;
-  targetArch =
-    if hostPlatform.isx86_64 then
-      "x86_64-linux-gnu"
-    else if hostPlatform.isAarch64 then
-      "aarch64-linux-gnu"
-    else
-      "unsupported";
+  # targetArch :: String
+  targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" {
+    x86_64-linux = "x86_64-linux-gnu";
+    aarch64-linux = "aarch64-linux-gnu";
+  };
 in
 finalAttrs: prevAttrs: {
   # Useful for inspecting why something went wrong.
@@ -69,7 +68,7 @@ finalAttrs: prevAttrs: {
 
   preInstall =
     (prevAttrs.preInstall or "")
-    + ''
+    + strings.optionalString (targetArch != "unsupported") ''
       # Replace symlinks to bin and lib with the actual directories from targets.
       for dir in bin lib; do
         rm "$dir"
diff --git a/pkgs/development/cuda-modules/tensorrt/shims.nix b/pkgs/development/cuda-modules/tensorrt/shims.nix
index 8be3e7988bb3..12465434ec85 100644
--- a/pkgs/development/cuda-modules/tensorrt/shims.nix
+++ b/pkgs/development/cuda-modules/tensorrt/shims.nix
@@ -1,13 +1,21 @@
 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
-{package, redistArch}:
 {
-  featureRelease.${redistArch}.outputs = {
-    bin = true;
-    lib = true;
-    static = true;
-    dev = true;
-    sample = true;
-    python = true;
+  lib,
+  package,
+  # redistArch :: String
+  # String is `"unsupported"` if the given architecture is unsupported.
+  redistArch,
+}:
+{
+  featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
+    ${redistArch}.outputs = {
+      bin = true;
+      lib = true;
+      static = true;
+      dev = true;
+      sample = true;
+      python = true;
+    };
   };
   redistribRelease = {
     name = "TensorRT: a high-performance deep learning interface";
diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix
index 1aaab46e1d1d..b27b42bf3ae8 100644
--- a/pkgs/development/libraries/science/math/magma/generic.nix
+++ b/pkgs/development/libraries/science/math/magma/generic.nix
@@ -159,7 +159,7 @@ stdenv.mkDerivation {
     description = "Matrix Algebra on GPU and Multicore Architectures";
     license = licenses.bsd3;
     homepage = "http://icl.cs.utk.edu/magma/index.html";
-    platforms = platforms.unix;
+    platforms = platforms.linux;
     maintainers = with maintainers; [ connorbaker ];
 
     # Cf. https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20
diff --git a/pkgs/development/libraries/xgboost/default.nix b/pkgs/development/libraries/xgboost/default.nix
index 2a44ffc44382..0af51a40dfb1 100644
--- a/pkgs/development/libraries/xgboost/default.nix
+++ b/pkgs/development/libraries/xgboost/default.nix
@@ -14,7 +14,7 @@
 , rPackages
 }@inputs:
 
-assert ncclSupport -> cudaSupport;
+assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported);
 # Disable regular tests when building the R package
 # because 1) the R package runs its own tests and
 # 2) the R package creates a different binary shared
diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix
index 27b9e61fbc82..d8dc4d67a594 100644
--- a/pkgs/development/python-modules/jaxlib/default.nix
+++ b/pkgs/development/python-modules/jaxlib/default.nix
@@ -64,7 +64,8 @@ let
     # aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136
     # however even with that fix applied, it doesn't work for everyone:
     # https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129
-    broken = stdenv.isDarwin;
+    # NOTE: We always build with NCCL; if it is unsupported, then our build is broken.
+    broken = stdenv.isDarwin || nccl.meta.unsupported;
   };
 
   cudatoolkit_joined = symlinkJoin {
diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 8fb227cbd36b..802d1a920141 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -7,7 +7,8 @@
   magma,
   magma-hip,
   magma-cuda-static,
-  useSystemNccl ? true,
+  # Use the system NCCL as long as it is supported.
+  useSystemNccl ? !cudaPackages.nccl.meta.unsupported,
   MPISupport ? false, mpi,
   buildDocs ? false,
 
@@ -57,6 +58,7 @@
 let
   inherit (lib) attrsets lists strings trivial;
   inherit (cudaPackages) cudaFlags cudnn nccl;
+  ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported;
 
   setBool = v: if v then "1" else "0";
 
@@ -121,6 +123,7 @@ let
     "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]);
     "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit);
     "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages);
+    "Requested system NCCL, but cudaPackages.nccl is not supported" = useSystemNccl && !ncclSupported;
   };
 in buildPythonPackage rec {
   pname = "torch";
@@ -273,9 +276,9 @@ in buildPythonPackage rec {
   PYTORCH_BUILD_VERSION = version;
   PYTORCH_BUILD_NUMBER = 0;
 
-  USE_NCCL = setBool (cudaSupport && cudaPackages ? nccl);
-  USE_SYSTEM_NCCL = setBool useSystemNccl;                  # don't build pytorch's third_party NCCL
-  USE_STATIC_NCCL = setBool useSystemNccl;
+  USE_NCCL = setBool (cudaSupport && ncclSupported);
+  USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl);                  # don't build pytorch's third_party NCCL
+  USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl);
 
   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
   # (upstream seems to have fixed this in the wrong place?)
@@ -363,7 +366,7 @@ in buildPythonPackage rec {
     ] ++ lists.optionals (cudaPackages ? cudnn) [
       cudnn.dev
       cudnn.lib
-    ] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [
+    ] ++ lists.optionals (useSystemNccl && ncclSupported) [
       # Some platforms do not support NCCL (i.e., Jetson)
       nccl.dev # Provides nccl.h AND a static copy of NCCL!
     ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix
index f997963ff468..f20a36152203 100644
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@@ -73,10 +73,6 @@ let
         # Loose packages
         cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {};
         saxpy = final.callPackage ../development/cuda-modules/saxpy {};
-      }
-      # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
-      # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
-      // attrsets.optionalAttrs (!flags.isJetsonBuild) {
         nccl = final.callPackage ../development/cuda-modules/nccl {};
         nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {};
       }

From 5e472d946836bb1be3e2ba30e01d746d5b597876 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Thu, 4 Jan 2024 13:48:10 +0000
Subject: [PATCH 3/6] cudaPackages: unsupported platform should not set broken
 to true

---
 .../cuda-modules/generic-builders/manifest.nix           | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 64204346791a..f77a1a6385a9 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -127,13 +127,10 @@ backendStdenv.mkDerivation (
     # Useful for introspecting why something went wrong.
     # Maps descriptions of why the derivation would be marked broken to
     # booleans indicating whether that description is true.
+    # NOTE: This should not include reasons relating to the architecture, as those are handled by
+    # the `badPlatforms` attribute.
     # brokenConditions :: AttrSet Bool
-    brokenConditions = {
-      # Using an unrecognized redistArch
-      "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported";
-      # Trying to build for a platform that doesn't have a redistributable
-      "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null;
-    };
+    brokenConditions = { };
 
     # src :: Optional Derivation
     src = trivial.pipe redistArch [

From 5c260fa5321120f660578674c6d41f54b4290f21 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 9 Jan 2024 15:33:13 +0000
Subject: [PATCH 4/6] cudaPackages: set badPlatforms when cudaSupport is false

---
 .../cuda-modules/cuda/overrides.nix           |  2 +-
 .../generic-builders/manifest.nix             | 26 ++++++++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index d92e07bb1b0b..e40d58736e8b 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -72,7 +72,7 @@ attrsets.filterAttrs (attr: _: (builtins.hasAttr attr prev)) {
       env.autoPatchelfIgnoreMissingDeps =
         prevAttrs.env.autoPatchelfIgnoreMissingDeps + " libnvrm_gpu.so libnvrm_mem.so libnvdla_runtime.so";
       # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices.
-      brokenConditions = prevAttrs.brokenConditions // {
+      badPlatformsConditions = prevAttrs.badPlatformsConditions // {
         "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" =
           !final.flags.isJetsonBuild;
       };
diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index f77a1a6385a9..dcc7c91d1e28 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -4,6 +4,7 @@
   autoAddCudaCompatRunpathHook,
   autoPatchelfHook,
   backendStdenv,
+  config,
   fetchurl,
   lib,
   lndir,
@@ -124,14 +125,22 @@ backendStdenv.mkDerivation (
       python = ["**/*.whl"];
     };
 
-    # Useful for introspecting why something went wrong.
-    # Maps descriptions of why the derivation would be marked broken to
-    # booleans indicating whether that description is true.
-    # NOTE: This should not include reasons relating to the architecture, as those are handled by
-    # the `badPlatforms` attribute.
+    # Useful for introspecting why something went wrong. Maps descriptions of why the derivation would be marked as
+    # broken on have badPlatforms include the current platform.
+
     # brokenConditions :: AttrSet Bool
+    # Sets `meta.broken = true` if any of the conditions are true.
+    # Example: Broken on a specific version of CUDA or when a dependency has a specific version.
     brokenConditions = { };
 
+    # badPlatformsConditions :: AttrSet Bool
+    # Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true.
+    # Example: Broken on a specific architecture or when cudaSupport is false (building with CUDA essentially targets)
+    # a platform which NixOS doesn't have a notion of, otherwise we would specify the platform directly.
+    badPlatformsConditions = {
+      "CUDA support is disabled" = !config.cudaSupport;
+    };
+
     # src :: Optional Derivation
     src = trivial.pipe redistArch [
       # If redistArch doesn't exist in redistribRelease, return null.
@@ -303,13 +312,18 @@ backendStdenv.mkDerivation (
     meta = {
       description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}";
       sourceProvenance = [sourceTypes.binaryNativeCode];
+      broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions);
       platforms = trivial.pipe supportedRedistArchs [
         # Map each redist arch to the equivalent nix system or null if there is no equivalent.
         (builtins.map flags.getNixSystem)
         # Filter out unsupported systems
         (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem)))
       ];
-      broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions);
+      badPlatforms =
+        let
+          isBadPlatform = lists.any trivial.id (attrsets.attrValues finalAttrs.badPlatformsConditions);
+        in
+        lists.optionals isBadPlatform finalAttrs.meta.platforms;
       license = licenses.unfree;
       maintainers = teams.cuda.members;
       # Force the use of the default, fat output by default (even though `dev` exists, which

From 39cab2b768ef7a4b692f2d1b1516a7d44957e1c5 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 9 Jan 2024 22:31:43 +0000
Subject: [PATCH 5/6] python3Packages.torch: only build with NCCL when
 targeting CUDA on a supported platform

---
 .../development/python-modules/torch/default.nix | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 802d1a920141..8a499d763a4a 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -7,8 +7,8 @@
   magma,
   magma-hip,
   magma-cuda-static,
-  # Use the system NCCL as long as it is supported.
-  useSystemNccl ? !cudaPackages.nccl.meta.unsupported,
+  # Use the system NCCL as long as we're targeting CUDA on a supported platform.
+  useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported),
   MPISupport ? false, mpi,
   buildDocs ? false,
 
@@ -58,7 +58,6 @@
 let
   inherit (lib) attrsets lists strings trivial;
   inherit (cudaPackages) cudaFlags cudnn nccl;
-  ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported;
 
   setBool = v: if v then "1" else "0";
 
@@ -123,7 +122,6 @@ let
     "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]);
     "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit);
     "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages);
-    "Requested system NCCL, but cudaPackages.nccl is not supported" = useSystemNccl && !ncclSupported;
   };
 in buildPythonPackage rec {
   pname = "torch";
@@ -276,9 +274,11 @@ in buildPythonPackage rec {
   PYTORCH_BUILD_VERSION = version;
   PYTORCH_BUILD_NUMBER = 0;
 
-  USE_NCCL = setBool (cudaSupport && ncclSupported);
-  USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl);                  # don't build pytorch's third_party NCCL
-  USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl);
+  # In-tree builds of NCCL are not supported.
+  # Use NCCL when cudaSupport is enabled and nccl is available.
+  USE_NCCL = setBool useSystemNccl;
+  USE_SYSTEM_NCCL = USE_NCCL;
+  USE_STATIC_NCCL = USE_NCCL;
 
   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
   # (upstream seems to have fixed this in the wrong place?)
@@ -366,7 +366,7 @@ in buildPythonPackage rec {
     ] ++ lists.optionals (cudaPackages ? cudnn) [
       cudnn.dev
       cudnn.lib
-    ] ++ lists.optionals (useSystemNccl && ncclSupported) [
+    ] ++ lists.optionals useSystemNccl [
       # Some platforms do not support NCCL (i.e., Jetson)
       nccl.dev # Provides nccl.h AND a static copy of NCCL!
     ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [

From b2f97e14aee25445d324a53d3374cb3547242ba7 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 9 Jan 2024 23:02:00 +0000
Subject: [PATCH 6/6] cudaPackages: default badPlatformsConditions to empty

---
 .../cuda-modules/generic-builders/manifest.nix            | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index dcc7c91d1e28..a823bb7d9ac7 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -4,7 +4,6 @@
   autoAddCudaCompatRunpathHook,
   autoPatchelfHook,
   backendStdenv,
-  config,
   fetchurl,
   lib,
   lndir,
@@ -135,11 +134,8 @@ backendStdenv.mkDerivation (
 
     # badPlatformsConditions :: AttrSet Bool
     # Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true.
-    # Example: Broken on a specific architecture or when cudaSupport is false (building with CUDA essentially targets)
-    # a platform which NixOS doesn't have a notion of, otherwise we would specify the platform directly.
-    badPlatformsConditions = {
-      "CUDA support is disabled" = !config.cudaSupport;
-    };
+    # Example: Broken on a specific architecture when some condition is met (like targeting Jetson).
+    badPlatformsConditions = { };
 
     # src :: Optional Derivation
     src = trivial.pipe redistArch [