From 501a1af970ca54cb300474a00aacfbd01f8a5b24 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Thu, 14 Dec 2023 20:12:20 +0000 Subject: [PATCH 1/6] cudaPackages.saxpy: now available pre-11.4 with CUDA Toolkit --- pkgs/top-level/cuda-packages.nix | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix index 9045b5754ab8..f997963ff468 100644 --- a/pkgs/top-level/cuda-packages.nix +++ b/pkgs/top-level/cuda-packages.nix @@ -72,10 +72,7 @@ let # Loose packages cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {}; - # SaxPy is only available after 11.4 because it requires redistributable versions of CUDA libraries. - saxpy = attrsets.optionalAttrs (strings.versionAtLeast cudaVersion "11.4") ( - final.callPackage ../development/cuda-modules/saxpy {} - ); + saxpy = final.callPackage ../development/cuda-modules/saxpy {}; } # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. 
# https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 From 9bebd9e72d6b552fcfd3d1e6716eca6563944f42 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Thu, 14 Dec 2023 22:19:02 +0000 Subject: [PATCH 2/6] tree-wide: cudaPackages should not break default eval cudaPackages: guard expressions against null values --- .../science/math/caffe/default.nix | 2 +- pkgs/development/cuda-modules/cudnn/shims.nix | 18 ++++-- .../cuda-modules/cutensor/extension.nix | 1 + pkgs/development/cuda-modules/flags.nix | 50 ++++++--------- .../generic-builders/manifest.nix | 62 ++++++++++++------- .../generic-builders/multiplex.nix | 18 ++---- .../development/cuda-modules/nccl/default.nix | 3 + .../cuda-modules/tensorrt/fixup.nix | 15 +++-- .../cuda-modules/tensorrt/shims.nix | 24 ++++--- .../libraries/science/math/magma/generic.nix | 2 +- .../development/libraries/xgboost/default.nix | 2 +- .../python-modules/jaxlib/default.nix | 3 +- .../python-modules/torch/default.nix | 13 ++-- pkgs/top-level/cuda-packages.nix | 4 -- 14 files changed, 119 insertions(+), 98 deletions(-) diff --git a/pkgs/applications/science/math/caffe/default.nix b/pkgs/applications/science/math/caffe/default.nix index 6595f0b846dd..25f7229a845a 100644 --- a/pkgs/applications/science/math/caffe/default.nix +++ b/pkgs/applications/science/math/caffe/default.nix @@ -153,7 +153,7 @@ stdenv.mkDerivation rec { || cudaSupport || !(leveldbSupport -> (leveldb != null && snappy != null)) || !(cudnnSupport -> (hasCudnn && cudaSupport)) - || !(ncclSupport -> cudaSupport) + || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported)) || !(pythonSupport -> (python != null && numpy != null)) ; license = licenses.bsd2; diff --git a/pkgs/development/cuda-modules/cudnn/shims.nix b/pkgs/development/cuda-modules/cudnn/shims.nix index e9eca8ef7c8b..a36ee26dab5d 100644 --- a/pkgs/development/cuda-modules/cudnn/shims.nix +++ b/pkgs/development/cuda-modules/cudnn/shims.nix @@ -1,10 +1,18 @@ # Shims to mimic the shape 
of ../modules/generic/manifests/{feature,redistrib}/release.nix -{package, redistArch}: { - featureRelease.${redistArch}.outputs = { - lib = true; - static = true; - dev = true; + lib, + package, + # redistArch :: String + # String is "unsupported" if the given architecture is unsupported. + redistArch, +}: +{ + featureRelease = lib.optionalAttrs (redistArch != "unsupported") { + ${redistArch}.outputs = { + lib = true; + static = true; + dev = true; + }; }; redistribRelease = { name = "NVIDIA CUDA Deep Neural Network library (cuDNN)"; diff --git a/pkgs/development/cuda-modules/cutensor/extension.nix b/pkgs/development/cuda-modules/cutensor/extension.nix index b762fd22ede8..534941887c6e 100644 --- a/pkgs/development/cuda-modules/cutensor/extension.nix +++ b/pkgs/development/cuda-modules/cutensor/extension.nix @@ -92,6 +92,7 @@ let # A release is supported if it has a libPath that matches our CUDA version for our platform. # LibPath are not constant across the same release -- one platform may support fewer # CUDA versions than another. + # redistArch :: String redistArch = flags.getRedistArch hostPlatform.system; # platformIsSupported :: Manifests -> Boolean platformIsSupported = diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix index a123c7bce5a1..d5e01be01fd5 100644 --- a/pkgs/development/cuda-modules/flags.nix +++ b/pkgs/development/cuda-modules/flags.nix @@ -131,39 +131,29 @@ let # `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices. # Since both are based on aarch64, we can only have one or the other, otherwise there's an # ambiguity as to which should be used. + # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of + # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported + # systems gracefully. 
# getRedistArch :: String -> String - getRedistArch = - nixSystem: - if nixSystem == "aarch64-linux" then - if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa" - else if nixSystem == "x86_64-linux" then - "linux-x86_64" - else if nixSystem == "ppc64le-linux" then - "linux-ppc64le" - else if nixSystem == "x86_64-windows" then - "windows-x86_64" - else - "unsupported"; + getRedistArch = nixSystem: attrsets.attrByPath [ nixSystem ] "unsupported" { + aarch64-linux = if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"; + x86_64-linux = "linux-x86_64"; + ppc64le-linux = "linux-ppc64le"; + x86_64-windows = "windows-x86_64"; + }; # Maps NVIDIA redist arch to Nix system. - # It is imperative that we include the boolean condition based on jetsonTargets to ensure - # we don't advertise availability of packages only available on server-grade ARM - # as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are - # mapped to the Nix system `aarch64-linux`. - getNixSystem = - redistArch: - if redistArch == "linux-sbsa" && jetsonTargets == [] then - "aarch64-linux" - else if redistArch == "linux-aarch64" && jetsonTargets != [] then - "aarch64-linux" - else if redistArch == "linux-x86_64" then - "x86_64-linux" - else if redistArch == "linux-ppc64le" then - "ppc64le-linux" - else if redistArch == "windows-x86_64" then - "x86_64-windows" - else - "unsupported-${redistArch}"; + # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of + # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported + # systems gracefully. 
+ # getNixSystem :: String -> String + getNixSystem = redistArch: attrsets.attrByPath [ redistArch ] "unsupported-${redistArch}" { + linux-sbsa = "aarch64-linux"; + linux-aarch64 = "aarch64-linux"; + linux-x86_64 = "x86_64-linux"; + linux-ppc64le = "ppc64le-linux"; + windows-x86_64 = "x86_64-windows"; + }; formatCapabilities = { diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix index 5a4c5280d7db..64204346791a 100644 --- a/pkgs/development/cuda-modules/generic-builders/manifest.nix +++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix @@ -42,6 +42,9 @@ let # Get the redist architectures for which package provides distributables. # These are used by meta.platforms. supportedRedistArchs = builtins.attrNames featureRelease; + # redistArch :: String + # The redistArch is the name of the architecture for which the redistributable is built. + # It is `"unsupported"` if the redistributable is not supported on the target platform. redistArch = flags.getRedistArch hostPlatform.system; in backendStdenv.mkDerivation ( @@ -86,8 +89,18 @@ backendStdenv.mkDerivation ( "sample" "python" ]; + # Filter out outputs that don't exist in the redistributable. + # NOTE: In the case the redistributable isn't supported on the target platform, + # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which + # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`. + # The alternative would be to have `outputs = [ "out" ]` when `redistArch = "unsupported"`, but that would + # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true -- + # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with + # `cudaSupport = false`! 
additionalOutputs = - if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs; + if redistArch == "unsupported" + then possibleOutputs + else builtins.filter hasOutput possibleOutputs; # The out output is special -- it's the default output and we always include it. outputs = [ "out" ] ++ additionalOutputs; in @@ -114,19 +127,28 @@ backendStdenv.mkDerivation ( # Useful for introspecting why something went wrong. # Maps descriptions of why the derivation would be marked broken to # booleans indicating whether that description is true. - brokenConditions = {}; - - src = fetchurl { - url = - if (builtins.hasAttr redistArch redistribRelease) then - "https://developer.download.nvidia.com/compute/${redistName}/redist/${ - redistribRelease.${redistArch}.relative_path - }" - else - "cannot-construct-an-url-for-the-${redistArch}-platform"; - sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash; + # brokenConditions :: AttrSet Bool + brokenConditions = { + # Using an unrecognized redistArch + "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported"; + # Trying to build for a platform that doesn't have a redistributable + "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null; }; + # src :: Optional Derivation + src = trivial.pipe redistArch [ + # If redistArch doesn't exist in redistribRelease, return null. + (redistArch: redistribRelease.${redistArch} or null) + # If the release is non-null, fetch the source; otherwise, return null. + (trivial.mapNullable ( + { relative_path, sha256, ... }: + fetchurl { + url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}"; + inherit sha256; + } + )) + ]; + postPatch = '' if [[ -d pkg-config ]] ; then mkdir -p share/pkg-config @@ -284,16 +306,12 @@ backendStdenv.mkDerivation ( meta = { description = "${redistribRelease.name}. 
By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}"; sourceProvenance = [sourceTypes.binaryNativeCode]; - platforms = - lists.concatMap - ( - redistArch: - let - nixSystem = flags.getNixSystem redistArch; - in - lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ] - ) - supportedRedistArchs; + platforms = trivial.pipe supportedRedistArchs [ + # Map each redist arch to the equivalent nix system or null if there is no equivalent. + (builtins.map flags.getNixSystem) + # Filter out unsupported systems + (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem))) + ]; broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions); license = licenses.unfree; maintainers = teams.cuda.members; diff --git a/pkgs/development/cuda-modules/generic-builders/multiplex.nix b/pkgs/development/cuda-modules/generic-builders/multiplex.nix index 5480da730726..6353b07545a4 100644 --- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix +++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix @@ -20,7 +20,7 @@ # The featureRelease is used to populate meta.platforms (by way of looking at the attribute names) # and to determine the outputs of the package. # shimFn :: {package, redistArch} -> AttrSet - shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"), + shimsFn ? (throw "shimsFn must be provided"), # fixupFn :: Path # A path (or nix expression) to be evaluated with callPackage and then # provided to the package's overrideAttrs function. @@ -29,16 +29,8 @@ # - cudaVersion # - mkVersionedPackageName # - package - fixupFn ? ( - { - final, - cudaVersion, - mkVersionedPackageName, - package, - ... - }: - throw "fixupFn must be provided" - ), + # - ... + fixupFn ? 
(throw "fixupFn must be provided"), }: let inherit (lib) @@ -80,9 +72,11 @@ let && strings.versionAtLeast package.maxCudaVersion cudaVersion; # Get all of the packages for our given platform. + # redistArch :: String + # Value is `"unsupported"` if the platform is not supported. redistArch = flags.getRedistArch hostPlatform.system; - allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets); + allReleases = lists.flatten (builtins.attrValues releaseSets); # All the supported packages we can build for our platform. # perSystemReleases :: List Package diff --git a/pkgs/development/cuda-modules/nccl/default.nix b/pkgs/development/cuda-modules/nccl/default.nix index c56d59cb4206..6e385688d0f8 100644 --- a/pkgs/development/cuda-modules/nccl/default.nix +++ b/pkgs/development/cuda-modules/nccl/default.nix @@ -100,6 +100,9 @@ backendStdenv.mkDerivation ( homepage = "https://developer.nvidia.com/nccl"; license = licenses.bsd3; platforms = platforms.linux; + # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. 
+ # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 + badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ]; maintainers = with maintainers; [ diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix index 43a7dfb81784..51ca3d652bd1 100644 --- a/pkgs/development/cuda-modules/tensorrt/fixup.nix +++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix @@ -11,18 +11,17 @@ }: let inherit (lib) + attrsets maintainers meta strings versions ; - targetArch = - if hostPlatform.isx86_64 then - "x86_64-linux-gnu" - else if hostPlatform.isAarch64 then - "aarch64-linux-gnu" - else - "unsupported"; + # targetArch :: String + targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" { + x86_64-linux = "x86_64-linux-gnu"; + aarch64-linux = "aarch64-linux-gnu"; + }; in finalAttrs: prevAttrs: { # Useful for inspecting why something went wrong. @@ -69,7 +68,7 @@ finalAttrs: prevAttrs: { preInstall = (prevAttrs.preInstall or "") - + '' + + strings.optionalString (targetArch != "unsupported") '' # Replace symlinks to bin and lib with the actual directories from targets. for dir in bin lib; do rm "$dir" diff --git a/pkgs/development/cuda-modules/tensorrt/shims.nix b/pkgs/development/cuda-modules/tensorrt/shims.nix index 8be3e7988bb3..12465434ec85 100644 --- a/pkgs/development/cuda-modules/tensorrt/shims.nix +++ b/pkgs/development/cuda-modules/tensorrt/shims.nix @@ -1,13 +1,21 @@ # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix -{package, redistArch}: { - featureRelease.${redistArch}.outputs = { - bin = true; - lib = true; - static = true; - dev = true; - sample = true; - python = true; + lib, + package, + # redistArch :: String + # String is `"unsupported"` if the given architecture is unsupported. 
+ redistArch, +}: +{ + featureRelease = lib.optionalAttrs (redistArch != "unsupported") { + ${redistArch}.outputs = { + bin = true; + lib = true; + static = true; + dev = true; + sample = true; + python = true; + }; }; redistribRelease = { name = "TensorRT: a high-performance deep learning interface"; diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix index 1aaab46e1d1d..b27b42bf3ae8 100644 --- a/pkgs/development/libraries/science/math/magma/generic.nix +++ b/pkgs/development/libraries/science/math/magma/generic.nix @@ -159,7 +159,7 @@ stdenv.mkDerivation { description = "Matrix Algebra on GPU and Multicore Architectures"; license = licenses.bsd3; homepage = "http://icl.cs.utk.edu/magma/index.html"; - platforms = platforms.unix; + platforms = platforms.linux; maintainers = with maintainers; [ connorbaker ]; # Cf. https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20 diff --git a/pkgs/development/libraries/xgboost/default.nix b/pkgs/development/libraries/xgboost/default.nix index 2a44ffc44382..0af51a40dfb1 100644 --- a/pkgs/development/libraries/xgboost/default.nix +++ b/pkgs/development/libraries/xgboost/default.nix @@ -14,7 +14,7 @@ , rPackages }@inputs: -assert ncclSupport -> cudaSupport; +assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported); # Disable regular tests when building the R package # because 1) the R package runs its own tests and # 2) the R package creates a different binary shared diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix index 27b9e61fbc82..d8dc4d67a594 100644 --- a/pkgs/development/python-modules/jaxlib/default.nix +++ b/pkgs/development/python-modules/jaxlib/default.nix @@ -64,7 +64,8 @@ let # aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136 # however even with that fix applied, it doesn't 
work for everyone: # https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129 - broken = stdenv.isDarwin; + # NOTE: We always build with NCCL; if it is unsupported, then our build is broken. + broken = stdenv.isDarwin || nccl.meta.unsupported; }; cudatoolkit_joined = symlinkJoin { diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix index 8fb227cbd36b..802d1a920141 100644 --- a/pkgs/development/python-modules/torch/default.nix +++ b/pkgs/development/python-modules/torch/default.nix @@ -7,7 +7,8 @@ magma, magma-hip, magma-cuda-static, - useSystemNccl ? true, + # Use the system NCCL as long as it is supported. + useSystemNccl ? !cudaPackages.nccl.meta.unsupported, MPISupport ? false, mpi, buildDocs ? false, @@ -57,6 +58,7 @@ let inherit (lib) attrsets lists strings trivial; inherit (cudaPackages) cudaFlags cudnn nccl; + ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported; setBool = v: if v then "1" else "0"; @@ -121,6 +123,7 @@ let "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]); "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit); "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages); + "Requested system NCCL, but cudaPackages.nccl is not supported" = useSystemNccl && !ncclSupported; }; in buildPythonPackage rec { pname = "torch"; @@ -273,9 +276,9 @@ in buildPythonPackage rec { PYTORCH_BUILD_VERSION = version; PYTORCH_BUILD_NUMBER = 0; - USE_NCCL = setBool (cudaSupport && cudaPackages ? 
nccl); - USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL - USE_STATIC_NCCL = setBool useSystemNccl; + USE_NCCL = setBool (cudaSupport && ncclSupported); + USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl); # don't build pytorch's third_party NCCL + USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl); # Suppress a weird warning in mkl-dnn, part of ideep in pytorch # (upstream seems to have fixed this in the wrong place?) @@ -363,7 +366,7 @@ in buildPythonPackage rec { ] ++ lists.optionals (cudaPackages ? cudnn) [ cudnn.dev cudnn.lib - ] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [ + ] ++ lists.optionals (useSystemNccl && ncclSupported) [ # Some platforms do not support NCCL (i.e., Jetson) nccl.dev # Provides nccl.h AND a static copy of NCCL! ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [ diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix index f997963ff468..f20a36152203 100644 --- a/pkgs/top-level/cuda-packages.nix +++ b/pkgs/top-level/cuda-packages.nix @@ -73,10 +73,6 @@ let # Loose packages cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {}; saxpy = final.callPackage ../development/cuda-modules/saxpy {}; - } - # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. 
- # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 - // attrsets.optionalAttrs (!flags.isJetsonBuild) { nccl = final.callPackage ../development/cuda-modules/nccl {}; nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {}; } From 5e472d946836bb1be3e2ba30e01d746d5b597876 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Thu, 4 Jan 2024 13:48:10 +0000 Subject: [PATCH 3/6] cudaPackages: unsupported platform should not set broken to true --- .../cuda-modules/generic-builders/manifest.nix | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix index 64204346791a..f77a1a6385a9 100644 --- a/pkgs/development/cuda-modules/generic-builders/manifest.nix +++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix @@ -127,13 +127,10 @@ backendStdenv.mkDerivation ( # Useful for introspecting why something went wrong. # Maps descriptions of why the derivation would be marked broken to # booleans indicating whether that description is true. + # NOTE: This should not include reasons relating to the architecture, as those are handled by + # the `badPlatforms` attribute. 
# brokenConditions :: AttrSet Bool - brokenConditions = { - # Using an unrecognized redistArch - "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported"; - # Trying to build for a platform that doesn't have a redistributable - "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null; - }; + brokenConditions = { }; # src :: Optional Derivation src = trivial.pipe redistArch [ From 5c260fa5321120f660578674c6d41f54b4290f21 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Tue, 9 Jan 2024 15:33:13 +0000 Subject: [PATCH 4/6] cudaPackages: set badPlatforms when cudaSupport is false --- .../cuda-modules/cuda/overrides.nix | 2 +- .../generic-builders/manifest.nix | 26 ++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix index d92e07bb1b0b..e40d58736e8b 100644 --- a/pkgs/development/cuda-modules/cuda/overrides.nix +++ b/pkgs/development/cuda-modules/cuda/overrides.nix @@ -72,7 +72,7 @@ attrsets.filterAttrs (attr: _: (builtins.hasAttr attr prev)) { env.autoPatchelfIgnoreMissingDeps = prevAttrs.env.autoPatchelfIgnoreMissingDeps + " libnvrm_gpu.so libnvrm_mem.so libnvdla_runtime.so"; # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices. 
- brokenConditions = prevAttrs.brokenConditions // { + badPlatformsConditions = prevAttrs.badPlatformsConditions // { "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" = !final.flags.isJetsonBuild; }; diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix index f77a1a6385a9..dcc7c91d1e28 100644 --- a/pkgs/development/cuda-modules/generic-builders/manifest.nix +++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix @@ -4,6 +4,7 @@ autoAddCudaCompatRunpathHook, autoPatchelfHook, backendStdenv, + config, fetchurl, lib, lndir, @@ -124,14 +125,22 @@ backendStdenv.mkDerivation ( python = ["**/*.whl"]; }; - # Useful for introspecting why something went wrong. - # Maps descriptions of why the derivation would be marked broken to - # booleans indicating whether that description is true. - # NOTE: This should not include reasons relating to the architecture, as those are handled by - # the `badPlatforms` attribute. + # Useful for introspecting why something went wrong. Maps descriptions of why the derivation would be marked as + # broken or have badPlatforms include the current platform. + # brokenConditions :: AttrSet Bool + # Sets `meta.broken = true` if any of the conditions are true. + # Example: Broken on a specific version of CUDA or when a dependency has a specific version. brokenConditions = { }; + # badPlatformsConditions :: AttrSet Bool + # Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true. + # Example: Broken on a specific architecture or when cudaSupport is false (building with CUDA essentially targets) + # a platform which NixOS doesn't have a notion of, otherwise we would specify the platform directly. + badPlatformsConditions = { + "CUDA support is disabled" = !config.cudaSupport; + }; + # src :: Optional Derivation src = trivial.pipe redistArch [ # If redistArch doesn't exist in redistribRelease, return null. 
@@ -303,13 +312,18 @@ backendStdenv.mkDerivation ( meta = { description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}"; sourceProvenance = [sourceTypes.binaryNativeCode]; + broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions); platforms = trivial.pipe supportedRedistArchs [ # Map each redist arch to the equivalent nix system or null if there is no equivalent. (builtins.map flags.getNixSystem) # Filter out unsupported systems (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem))) ]; - broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions); + badPlatforms = + let + isBadPlatform = lists.any trivial.id (attrsets.attrValues finalAttrs.badPlatformsConditions); + in + lists.optionals isBadPlatform finalAttrs.meta.platforms; license = licenses.unfree; maintainers = teams.cuda.members; # Force the use of the default, fat output by default (even though `dev` exists, which From 39cab2b768ef7a4b692f2d1b1516a7d44957e1c5 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Tue, 9 Jan 2024 22:31:43 +0000 Subject: [PATCH 5/6] python3Packages.torch: only build with NCCL when targeting CUDA on a supported platform --- .../development/python-modules/torch/default.nix | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix index 802d1a920141..8a499d763a4a 100644 --- a/pkgs/development/python-modules/torch/default.nix +++ b/pkgs/development/python-modules/torch/default.nix @@ -7,8 +7,8 @@ magma, magma-hip, magma-cuda-static, - # Use the system NCCL as long as it is supported. - useSystemNccl ? !cudaPackages.nccl.meta.unsupported, + # Use the system NCCL as long as we're targeting CUDA on a supported platform. + useSystemNccl ? 
(cudaSupport && !cudaPackages.nccl.meta.unsupported), MPISupport ? false, mpi, buildDocs ? false, @@ -58,7 +58,6 @@ let inherit (lib) attrsets lists strings trivial; inherit (cudaPackages) cudaFlags cudnn nccl; - ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported; setBool = v: if v then "1" else "0"; @@ -123,7 +122,6 @@ let "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]); "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit); "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages); - "Requested system NCCL, but cudaPackages.nccl is not supported" = useSystemNccl && !ncclSupported; }; in buildPythonPackage rec { pname = "torch"; @@ -276,9 +274,11 @@ in buildPythonPackage rec { PYTORCH_BUILD_VERSION = version; PYTORCH_BUILD_NUMBER = 0; - USE_NCCL = setBool (cudaSupport && ncclSupported); - USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl); # don't build pytorch's third_party NCCL - USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl); + # In-tree builds of NCCL are not supported. + # Use NCCL when cudaSupport is enabled and nccl is available. + USE_NCCL = setBool useSystemNccl; + USE_SYSTEM_NCCL = USE_NCCL; + USE_STATIC_NCCL = USE_NCCL; # Suppress a weird warning in mkl-dnn, part of ideep in pytorch # (upstream seems to have fixed this in the wrong place?) @@ -366,7 +366,7 @@ in buildPythonPackage rec { ] ++ lists.optionals (cudaPackages ? cudnn) [ cudnn.dev cudnn.lib - ] ++ lists.optionals (useSystemNccl && ncclSupported) [ + ] ++ lists.optionals useSystemNccl [ # Some platforms do not support NCCL (i.e., Jetson) nccl.dev # Provides nccl.h AND a static copy of NCCL! 
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [ From b2f97e14aee25445d324a53d3374cb3547242ba7 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Tue, 9 Jan 2024 23:02:00 +0000 Subject: [PATCH 6/6] cudaPackages: default badPlatformsConditions to empty --- .../cuda-modules/generic-builders/manifest.nix | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix index dcc7c91d1e28..a823bb7d9ac7 100644 --- a/pkgs/development/cuda-modules/generic-builders/manifest.nix +++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix @@ -4,7 +4,6 @@ autoAddCudaCompatRunpathHook, autoPatchelfHook, backendStdenv, - config, fetchurl, lib, lndir, @@ -135,11 +134,8 @@ backendStdenv.mkDerivation ( # badPlatformsConditions :: AttrSet Bool # Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true. - # Example: Broken on a specific architecture or when cudaSupport is false (building with CUDA essentially targets) - # a platform which NixOS doesn't have a notion of, otherwise we would specify the platform directly. - badPlatformsConditions = { - "CUDA support is disabled" = !config.cudaSupport; - }; + # Example: Broken on a specific architecture when some condition is met (like targeting Jetson). + badPlatformsConditions = { }; # src :: Optional Derivation src = trivial.pipe redistArch [