Merge pull request #274319 from ConnorBaker/feat/cudaPackages-all-packages-eval

tree-wide: cudaPackages attributes should not cause default eval to fail
This commit is contained in:
Someone 2024-01-11 04:32:59 +00:00 committed by GitHub
commit e529aea84f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 131 additions and 106 deletions

View file

@ -153,7 +153,7 @@ stdenv.mkDerivation rec {
|| cudaSupport
|| !(leveldbSupport -> (leveldb != null && snappy != null))
|| !(cudnnSupport -> (hasCudnn && cudaSupport))
|| !(ncclSupport -> cudaSupport)
|| !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported))
|| !(pythonSupport -> (python != null && numpy != null))
;
license = licenses.bsd2;

View file

@ -72,7 +72,7 @@ attrsets.filterAttrs (attr: _: (builtins.hasAttr attr prev)) {
env.autoPatchelfIgnoreMissingDeps =
prevAttrs.env.autoPatchelfIgnoreMissingDeps + " libnvrm_gpu.so libnvrm_mem.so libnvdla_runtime.so";
# `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices.
brokenConditions = prevAttrs.brokenConditions // {
badPlatformsConditions = prevAttrs.badPlatformsConditions // {
"Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" =
!final.flags.isJetsonBuild;
};

View file

@ -1,10 +1,18 @@
# Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
{package, redistArch}:
{
featureRelease.${redistArch}.outputs = {
lib = true;
static = true;
dev = true;
lib,
package,
# redistArch :: String
# String is "unsupported" if the given architecture is unsupported.
redistArch,
}:
{
featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
${redistArch}.outputs = {
lib = true;
static = true;
dev = true;
};
};
redistribRelease = {
name = "NVIDIA CUDA Deep Neural Network library (cuDNN)";

View file

@ -92,6 +92,7 @@ let
# A release is supported if it has a libPath that matches our CUDA version for our platform.
# LibPath are not constant across the same release -- one platform may support fewer
# CUDA versions than another.
# redistArch :: String
redistArch = flags.getRedistArch hostPlatform.system;
# platformIsSupported :: Manifests -> Boolean
platformIsSupported =

View file

@ -131,39 +131,29 @@ let
# `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices.
# Since both are based on aarch64, we can only have one or the other, otherwise there's an
# ambiguity as to which should be used.
# NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
# `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
# systems gracefully.
# getRedistArch :: String -> String
getRedistArch =
nixSystem:
if nixSystem == "aarch64-linux" then
if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"
else if nixSystem == "x86_64-linux" then
"linux-x86_64"
else if nixSystem == "ppc64le-linux" then
"linux-ppc64le"
else if nixSystem == "x86_64-windows" then
"windows-x86_64"
else
"unsupported";
getRedistArch = nixSystem: attrsets.attrByPath [ nixSystem ] "unsupported" {
aarch64-linux = if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa";
x86_64-linux = "linux-x86_64";
ppc64le-linux = "linux-ppc64le";
x86_64-windows = "windows-x86_64";
};
# Maps NVIDIA redist arch to Nix system.
# It is imperative that we include the boolean condition based on jetsonTargets to ensure
# we don't advertise availability of packages only available on server-grade ARM
# as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are
# mapped to the Nix system `aarch64-linux`.
getNixSystem =
redistArch:
if redistArch == "linux-sbsa" && jetsonTargets == [] then
"aarch64-linux"
else if redistArch == "linux-aarch64" && jetsonTargets != [] then
"aarch64-linux"
else if redistArch == "linux-x86_64" then
"x86_64-linux"
else if redistArch == "linux-ppc64le" then
"ppc64le-linux"
else if redistArch == "windows-x86_64" then
"x86_64-windows"
else
"unsupported-${redistArch}";
# NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
# `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
# systems gracefully.
# getNixSystem :: String -> String
getNixSystem = redistArch: attrsets.attrByPath [ redistArch ] "unsupported-${redistArch}" {
linux-sbsa = "aarch64-linux";
linux-aarch64 = "aarch64-linux";
linux-x86_64 = "x86_64-linux";
linux-ppc64le = "ppc64le-linux";
windows-x86_64 = "x86_64-windows";
};
formatCapabilities =
{

View file

@ -43,6 +43,9 @@ let
# Get the redist architectures for which package provides distributables.
# These are used by meta.platforms.
supportedRedistArchs = builtins.attrNames featureRelease;
# redistArch :: String
# The redistArch is the name of the architecture for which the redistributable is built.
# It is `"unsupported"` if the redistributable is not supported on the target platform.
redistArch = flags.getRedistArch hostPlatform.system;
in
backendStdenv.mkDerivation (
@ -87,8 +90,18 @@ backendStdenv.mkDerivation (
"sample"
"python"
];
# Filter out outputs that don't exist in the redistributable.
# NOTE: In the case the redistributable isn't supported on the target platform,
# we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which
# aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`.
# The alternative would be to have `outputs = [ "out" ]` when`redistArch = "unsupported"`, but that would
# require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true --
# recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with
# `cudaSupport = false`!
additionalOutputs =
if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs;
if redistArch == "unsupported"
then possibleOutputs
else builtins.filter hasOutput possibleOutputs;
# The out output is special -- it's the default output and we always include it.
outputs = [ "out" ] ++ additionalOutputs;
in
@ -112,21 +125,32 @@ backendStdenv.mkDerivation (
python = ["**/*.whl"];
};
# Useful for introspecting why something went wrong.
# Maps descriptions of why the derivation would be marked broken to
# booleans indicating whether that description is true.
brokenConditions = {};
# Useful for introspecting why something went wrong. Maps descriptions of why the derivation would be marked as
# broken on have badPlatforms include the current platform.
src = fetchurl {
url =
if (builtins.hasAttr redistArch redistribRelease) then
"https://developer.download.nvidia.com/compute/${redistName}/redist/${
redistribRelease.${redistArch}.relative_path
}"
else
"cannot-construct-an-url-for-the-${redistArch}-platform";
sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash;
};
# brokenConditions :: AttrSet Bool
# Sets `meta.broken = true` if any of the conditions are true.
# Example: Broken on a specific version of CUDA or when a dependency has a specific version.
brokenConditions = { };
# badPlatformsConditions :: AttrSet Bool
# Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true.
# Example: Broken on a specific architecture when some condition is met (like targeting Jetson).
badPlatformsConditions = { };
# src :: Optional Derivation
src = trivial.pipe redistArch [
# If redistArch doesn't exist in redistribRelease, return null.
(redistArch: redistribRelease.${redistArch} or null)
# If the release is non-null, fetch the source; otherwise, return null.
(trivial.mapNullable (
{ relative_path, sha256, ... }:
fetchurl {
url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
inherit sha256;
}
))
];
# Handle the pkg-config files:
# 1. No FHS
@ -297,17 +321,18 @@ backendStdenv.mkDerivation (
meta = {
description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}";
sourceProvenance = [sourceTypes.binaryNativeCode];
platforms =
lists.concatMap
(
redistArch:
let
nixSystem = flags.getNixSystem redistArch;
in
lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ]
)
supportedRedistArchs;
broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions);
platforms = trivial.pipe supportedRedistArchs [
# Map each redist arch to the equivalent nix system or null if there is no equivalent.
(builtins.map flags.getNixSystem)
# Filter out unsupported systems
(builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem)))
];
badPlatforms =
let
isBadPlatform = lists.any trivial.id (attrsets.attrValues finalAttrs.badPlatformsConditions);
in
lists.optionals isBadPlatform finalAttrs.meta.platforms;
license = licenses.unfree;
maintainers = teams.cuda.members;
# Force the use of the default, fat output by default (even though `dev` exists, which

View file

@ -20,7 +20,7 @@
# The featureRelease is used to populate meta.platforms (by way of looking at the attribute names)
# and to determine the outputs of the package.
# shimFn :: {package, redistArch} -> AttrSet
shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"),
shimsFn ? (throw "shimsFn must be provided"),
# fixupFn :: Path
# A path (or nix expression) to be evaluated with callPackage and then
# provided to the package's overrideAttrs function.
@ -29,16 +29,8 @@
# - cudaVersion
# - mkVersionedPackageName
# - package
fixupFn ? (
{
final,
cudaVersion,
mkVersionedPackageName,
package,
...
}:
throw "fixupFn must be provided"
),
# - ...
fixupFn ? (throw "fixupFn must be provided"),
}:
let
inherit (lib)
@ -80,9 +72,11 @@ let
&& strings.versionAtLeast package.maxCudaVersion cudaVersion;
# Get all of the packages for our given platform.
# redistArch :: String
# Value is `"unsupported"` if the platform is not supported.
redistArch = flags.getRedistArch hostPlatform.system;
allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets);
allReleases = lists.flatten (builtins.attrValues releaseSets);
# All the supported packages we can build for our platform.
# perSystemReleases :: List Package

View file

@ -100,6 +100,9 @@ backendStdenv.mkDerivation (
homepage = "https://developer.nvidia.com/nccl";
license = licenses.bsd3;
platforms = platforms.linux;
# NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
# https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ];
maintainers =
with maintainers;
[

View file

@ -11,18 +11,17 @@
}:
let
inherit (lib)
attrsets
maintainers
meta
strings
versions
;
targetArch =
if hostPlatform.isx86_64 then
"x86_64-linux-gnu"
else if hostPlatform.isAarch64 then
"aarch64-linux-gnu"
else
"unsupported";
# targetArch :: String
targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" {
x86_64-linux = "x86_64-linux-gnu";
aarch64-linux = "aarch64-linux-gnu";
};
in
finalAttrs: prevAttrs: {
# Useful for inspecting why something went wrong.
@ -69,7 +68,7 @@ finalAttrs: prevAttrs: {
preInstall =
(prevAttrs.preInstall or "")
+ ''
+ strings.optionalString (targetArch != "unsupported") ''
# Replace symlinks to bin and lib with the actual directories from targets.
for dir in bin lib; do
rm "$dir"

View file

@ -1,13 +1,21 @@
# Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
{package, redistArch}:
{
featureRelease.${redistArch}.outputs = {
bin = true;
lib = true;
static = true;
dev = true;
sample = true;
python = true;
lib,
package,
# redistArch :: String
# String is `"unsupported"` if the given architecture is unsupported.
redistArch,
}:
{
featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
${redistArch}.outputs = {
bin = true;
lib = true;
static = true;
dev = true;
sample = true;
python = true;
};
};
redistribRelease = {
name = "TensorRT: a high-performance deep learning interface";

View file

@ -159,7 +159,7 @@ stdenv.mkDerivation {
description = "Matrix Algebra on GPU and Multicore Architectures";
license = licenses.bsd3;
homepage = "http://icl.cs.utk.edu/magma/index.html";
platforms = platforms.unix;
platforms = platforms.linux;
maintainers = with maintainers; [ connorbaker ];
# Cf. https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20

View file

@ -14,7 +14,7 @@
, rPackages
}@inputs:
assert ncclSupport -> cudaSupport;
assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported);
# Disable regular tests when building the R package
# because 1) the R package runs its own tests and
# 2) the R package creates a different binary shared

View file

@ -64,7 +64,8 @@ let
# aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136
# however even with that fix applied, it doesn't work for everyone:
# https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129
broken = stdenv.isDarwin;
# NOTE: We always build with NCCL; if it is unsupported, then our build is broken.
broken = stdenv.isDarwin || nccl.meta.unsupported;
};
cudatoolkit_joined = symlinkJoin {

View file

@ -7,7 +7,8 @@
magma,
magma-hip,
magma-cuda-static,
useSystemNccl ? true,
# Use the system NCCL as long as we're targeting CUDA on a supported platform.
useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported),
MPISupport ? false, mpi,
buildDocs ? false,
@ -273,9 +274,11 @@ in buildPythonPackage rec {
PYTORCH_BUILD_VERSION = version;
PYTORCH_BUILD_NUMBER = 0;
USE_NCCL = setBool (cudaSupport && cudaPackages ? nccl);
USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL
USE_STATIC_NCCL = setBool useSystemNccl;
# In-tree builds of NCCL are not supported.
# Use NCCL when cudaSupport is enabled and nccl is available.
USE_NCCL = setBool useSystemNccl;
USE_SYSTEM_NCCL = USE_NCCL;
USE_STATIC_NCCL = USE_NCCL;
# Suppress a weird warning in mkl-dnn, part of ideep in pytorch
# (upstream seems to have fixed this in the wrong place?)
@ -363,7 +366,7 @@ in buildPythonPackage rec {
] ++ lists.optionals (cudaPackages ? cudnn) [
cudnn.dev
cudnn.lib
] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [
] ++ lists.optionals useSystemNccl [
# Some platforms do not support NCCL (i.e., Jetson)
nccl.dev # Provides nccl.h AND a static copy of NCCL!
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [

View file

@ -72,14 +72,7 @@ let
# Loose packages
cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {};
# SaxPy is only available after 11.4 because it requires redistributable versions of CUDA libraries.
saxpy = attrsets.optionalAttrs (strings.versionAtLeast cudaVersion "11.4") (
final.callPackage ../development/cuda-modules/saxpy {}
);
}
# NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
# https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
// attrsets.optionalAttrs (!flags.isJetsonBuild) {
saxpy = final.callPackage ../development/cuda-modules/saxpy {};
nccl = final.callPackage ../development/cuda-modules/nccl {};
nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {};
}