Merge pull request #235024 from deshaw/upstream-dcgm
Add NVIDIA DCGM and DCGM-exporter (prometheus)
This commit is contained in:
commit
931999d13b
8 changed files with 277 additions and 11 deletions
|
@ -138,7 +138,7 @@ backendStdenv.mkDerivation rec {
|
||||||
(ucx.override { enableCuda = false; }) # Avoid infinite recursion
|
(ucx.override { enableCuda = false; }) # Avoid infinite recursion
|
||||||
xorg.libxshmfence
|
xorg.libxshmfence
|
||||||
xorg.libxkbfile
|
xorg.libxkbfile
|
||||||
] ++ (lib.optionals (lib.versionAtLeast version "12.1") (map lib.getLib ([
|
] ++ (lib.optionals (lib.versionAtLeast version "12") (map lib.getLib ([
|
||||||
# Used by `/target-linux-x64/CollectX/clx` and `/target-linux-x64/CollectX/libclx_api.so` for:
|
# Used by `/target-linux-x64/CollectX/clx` and `/target-linux-x64/CollectX/libclx_api.so` for:
|
||||||
# - `libcurl.so.4`
|
# - `libcurl.so.4`
|
||||||
curlMinimal
|
curlMinimal
|
||||||
|
@ -183,7 +183,9 @@ backendStdenv.mkDerivation rec {
|
||||||
"libcom_err.so.2"
|
"libcom_err.so.2"
|
||||||
];
|
];
|
||||||
|
|
||||||
preFixup = ''
|
preFixup = if lib.versionOlder version "11" then ''
|
||||||
|
patchelf $out/targets/*/lib/libnvrtc.so --add-needed libnvrtc-builtins.so
|
||||||
|
'' else ''
|
||||||
patchelf $out/lib64/libnvrtc.so --add-needed libnvrtc-builtins.so
|
patchelf $out/lib64/libnvrtc.so --add-needed libnvrtc-builtins.so
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
|
|
@ -40,19 +40,13 @@ stdenv.mkDerivation rec {
|
||||||
"-DBUILD_SHARED_LIBS=ON"
|
"-DBUILD_SHARED_LIBS=ON"
|
||||||
"-DBUILD_OBJECT_LIBS=OFF"
|
"-DBUILD_OBJECT_LIBS=OFF"
|
||||||
"-DJSONCPP_WITH_CMAKE_PACKAGE=ON"
|
"-DJSONCPP_WITH_CMAKE_PACKAGE=ON"
|
||||||
|
"-DBUILD_STATIC_LIBS=${if enableStatic then "ON" else "OFF"}"
|
||||||
]
|
]
|
||||||
# the test's won't compile if secureMemory is used because there is no
|
# the tests won't compile if secureMemory is used because there is no
|
||||||
# comparison operators and conversion functions between
|
# comparison operators and conversion functions between
|
||||||
# std::basic_string<..., Json::SecureAllocator<char>> vs.
|
# std::basic_string<..., Json::SecureAllocator<char>> vs.
|
||||||
# std::basic_string<..., [default allocator]>
|
# std::basic_string<..., [default allocator]>
|
||||||
++ lib.optional ((stdenv.buildPlatform != stdenv.hostPlatform) || secureMemory) "-DJSONCPP_WITH_TESTS=OFF"
|
++ lib.optional ((stdenv.buildPlatform != stdenv.hostPlatform) || secureMemory) "-DJSONCPP_WITH_TESTS=OFF";
|
||||||
++ lib.optional (!enableStatic) "-DBUILD_STATIC_LIBS=OFF";
|
|
||||||
|
|
||||||
# this is fixed and no longer necessary in 1.9.5 but there they use
|
|
||||||
# memset_s without switching to a different c++ standard in the cmake files
|
|
||||||
postInstall = lib.optionalString enableStatic ''
|
|
||||||
(cd $out/lib && ln -sf libjsoncpp_static.a libjsoncpp.a)
|
|
||||||
'';
|
|
||||||
|
|
||||||
meta = with lib; {
|
meta = with lib; {
|
||||||
homepage = "https://github.com/open-source-parsers/jsoncpp";
|
homepage = "https://github.com/open-source-parsers/jsoncpp";
|
||||||
|
|
|
@ -20,6 +20,8 @@ stdenv.mkDerivation rec {
|
||||||
})
|
})
|
||||||
];
|
];
|
||||||
|
|
||||||
|
configureFlags = lib.optional (!sslSupport) "--disable-openssl";
|
||||||
|
|
||||||
preConfigure = lib.optionalString (lib.versionAtLeast stdenv.hostPlatform.darwinMinVersion "11") ''
|
preConfigure = lib.optionalString (lib.versionAtLeast stdenv.hostPlatform.darwinMinVersion "11") ''
|
||||||
MACOSX_DEPLOYMENT_TARGET=10.16
|
MACOSX_DEPLOYMENT_TARGET=10.16
|
||||||
'';
|
'';
|
||||||
|
|
48
pkgs/development/libraries/tclap/1.4.nix
Normal file
48
pkgs/development/libraries/tclap/1.4.nix
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
# TCLAP 1.4 snapshot ("Templatized C++ Command Line Parser Library").
#
# Arguments:
#   lib, stdenv - standard nixpkgs helpers / build environment
#   fetchgit    - source fetcher (upstream lives on SourceForge git)
#   cmake, doxygen, python3 - build-time tools
{ lib
, stdenv
, fetchgit
, cmake
, doxygen
, python3
}:

stdenv.mkDerivation {
  pname = "tclap";

  # This version is slightly newer than 1.4.0-rc1:
  # See https://github.com/mirror/tclap/compare/1.4.0-rc1..3feeb7b2499b37d9cb80890cadaf7c905a9a50c6
  version = "1.4-3feeb7b";

  src = fetchgit {
    # Use HTTPS rather than the unauthenticated git:// protocol, which is
    # commonly blocked by firewalls/proxies. The output hash is computed over
    # the checked-out tree, so it is unaffected by the transport.
    url = "https://git.code.sf.net/p/tclap/code";
    rev = "3feeb7b2499b37d9cb80890cadaf7c905a9a50c6"; # 1.4 branch
    hash = "sha256-byLianB6Vf+I9ABMmsmuoGU2o5RO9c5sMckWW0F+GDM=";
  };

  # Rewrite install locations in the upstream build files:
  #  - CMakeLists.txt: CMAKE_INSTALL_LIBDIR_ARCHIND -> CMAKE_INSTALL_LIBDIR
  #  - pkgconfig.pc.in: "<prefix>/<relative includedir>" -> the full include dir
  # The '$'{...} spelling keeps Nix from treating "${" as an antiquotation while
  # the shell still sees a literal "${...}".
  postPatch = ''
    substituteInPlace CMakeLists.txt \
      --replace '$'{CMAKE_INSTALL_LIBDIR_ARCHIND} '$'{CMAKE_INSTALL_LIBDIR}
    substituteInPlace packaging/pkgconfig.pc.in \
      --replace '$'{prefix}/@CMAKE_INSTALL_INCLUDEDIR@ @CMAKE_INSTALL_FULL_INCLUDEDIR@
  '';

  nativeBuildInputs = [
    cmake
    doxygen
    python3
  ];

  # Installing docs is broken in this package+version so we stub out some files
  preInstall = ''
    touch docs/manual.html
  '';

  # Run the upstream test suite during the build.
  doCheck = true;

  meta = with lib; {
    description = "Templatized C++ Command Line Parser Library (v1.4)";
    homepage = "https://tclap.sourceforge.net/";
    license = licenses.mit;
    maintainers = teams.deshaw.members;
    platforms = platforms.all;
  };
}
|
147
pkgs/os-specific/linux/dcgm/default.nix
Normal file
147
pkgs/os-specific/linux/dcgm/default.nix
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
# NVIDIA Data Center GPU Manager (DCGM), built from source.
#
# The build uses gcc 11 and is configured against three CUDA package sets
# (10.2, 11.8 and 12) at once; see the comments on the individual bindings.
{ lib
, callPackage
, gcc11Stdenv
, fetchFromGitHub
, addOpenGLRunpath
, catch2
, cmake
, cudaPackages_10_2
, cudaPackages_11_8
, cudaPackages_12
, fmt_9
, git
, jsoncpp
, libevent
, plog
, python3
, symlinkJoin
, tclap_1_4
, yaml-cpp
}:
let
  # libevent without OpenSSL support, used as the base for the static variant.
  libevent-nossl = libevent.override { sslSupport = false; };

  # Static, position-independent libevent.
  # Flags copied from DCGM's libevent build script
  libevent-nossl-static = libevent-nossl.overrideAttrs (super: {
    CFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
    CXXFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
    configureFlags = super.configureFlags ++ [ "--disable-shared" "--with-pic" ];
  });

  # jsoncpp with its static library enabled.
  jsoncpp-static = jsoncpp.override { enableStatic = true; };

  # DCGM depends on 3 different versions of CUDA at the same time.
  # The runtime closure, thankfully, is quite small because most things
  # are statically linked.
  #
  # Each entry pairs a CUDA major version string with the list of packages
  # that provide its headers and libraries.
  cudaPackageSetByVersion = [
    {
      version = "10";
      # Nixpkgs cudaPackages_10 doesn't have redist packages broken out.
      pkgSet = [
        cudaPackages_10_2.cudatoolkit
        cudaPackages_10_2.cudatoolkit.lib
      ];
    }
    {
      version = "11";
      pkgSet = getCudaPackages cudaPackages_11_8;
    }
    {
      version = "12";
      pkgSet = getCudaPackages cudaPackages_12;
    }
  ];

  # Select needed redist packages from cudaPackages
  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
  #
  # p: a cudaPackages set; returns a list of packages taken from it.
  getCudaPackages = p: with p; [
    cuda_cccl
    cuda_cudart
    cuda_nvcc
    cuda_nvml_dev
    libcublas
    libcufft
    libcurand
  ];

  # Builds CMake code to add CUDA paths for include and lib.
  #
  # Takes one entry of cudaPackageSetByVersion and returns a CMake snippet that
  # appends to Cuda<version>_INCLUDE_PATHS and Cuda<version>_LIB_PATHS.
  mkAppendCudaPaths = { version, pkgSet }:
    let
      # The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
      # combine everything together for headers to work.
      # It would be more convenient to use symlinkJoin on *just* the include subdirectories
      # of each package, but not all of them have an include directory and making that work
      # is more effort than it's worth for this temporary, build-time package.
      combined = symlinkJoin {
        name = "cuda-combined-${version}";
        paths = pkgSet;
      };
      # The combined package above breaks the build for some reason so we just configure
      # each package's library path.
      # Result is a space-separated list of quoted "<pkg>/lib" paths.
      libs = lib.concatMapStringsSep " " (x: ''"${x}/lib"'') pkgSet;
    in ''
      list(APPEND Cuda${version}_INCLUDE_PATHS "${combined}/include")
      list(APPEND Cuda${version}_LIB_PATHS ${libs})
    '';

# gcc11 is required by DCGM's very particular build system
# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
in gcc11Stdenv.mkDerivation rec {
  pname = "dcgm";
  version = "3.1.8";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    rev = "refs/tags/v${version}";
    hash = "sha256-OXqXkP2ZUNPzafGIgJ0MKa39xB84keVFFYl+JsHgnks=";
  };

  # Add our paths to the CUDA paths so FindCuda.cmake can find them.
  # The generated CMake snippets (one per CUDA version) are exposed to the
  # builder as an environment variable and prepended to cmake/FindCuda.cmake.
  EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
  prePatch = ''
    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
  '';

  hardeningDisable = [ "all" ];

  # NOTE(review): the static libraries and header-only packages below are listed
  # under nativeBuildInputs rather than buildInputs; this looks deliberate for
  # this build, but confirm before moving them.
  nativeBuildInputs = [
    addOpenGLRunpath
    cmake
    git
    python3

    jsoncpp-static
    jsoncpp-static.dev
    libevent-nossl-static
    libevent-nossl-static.dev
    plog.dev # header-only
    tclap_1_4 # header-only
  ];

  buildInputs = [
    catch2
    fmt_9
    yaml-cpp
  ];

  # libcuda.so must be found at runtime because it is supplied by the NVIDIA
  # driver. autoAddOpenGLRunpathHook breaks on the statically linked exes.
  #
  # Only ELF files under bin/ and lib/ whose DT_NEEDED list mentions libcuda.so
  # get the OpenGL driver runpath added; `|| true` tolerates files patchelf
  # cannot inspect.
  postFixup = ''
    find "$out/bin" "$out/lib" -type f -executable -print0 | while IFS= read -r -d "" f; do
      if isELF "$f" && [[ $(patchelf --print-needed "$f" || true) == *libcuda.so* ]]; then
        addOpenGLRunpath "$f"
      fi
    done
  '';

  # Fail the build if any of the CUDA packages used above end up referenced by
  # the output.
  disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;

  meta = with lib; {
    # No trailing period, per the nixpkgs meta.description convention.
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = licenses.asl20;
    maintainers = teams.deshaw.members;
    mainProgram = "dcgmi";
    platforms = platforms.linux;
  };
}
|
66
pkgs/servers/monitoring/prometheus/dcgm-exporter/default.nix
Normal file
66
pkgs/servers/monitoring/prometheus/dcgm-exporter/default.nix
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
# Prometheus exporter for NVIDIA GPU metrics, linked against DCGM via cgo.
{ lib
, buildGoModule
, fetchFromGitHub
, cudaPackages
, dcgm
, linuxPackages
}:

buildGoModule rec {
  pname = "dcgm-exporter";
  version = "3.1.8-3.1.5";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = pname;
    rev = "refs/tags/${version}";
    hash = "sha256-Jzv3cU3gmGIXV+DV3wV/1zSWwz18s3Jax6JC7WZW7Z4=";
  };

  vendorHash = "sha256-KMCV79kUY1sNYysH0MmB7pVU98r7v+DpLIoYHxyyG4U=";

  # The vendoring (fixed-output) build fails with the upstream `go 1.16`
  # directive, so bump it to 1.17 there, re-tidy, and ship the resulting go.mod
  # alongside the vendored modules.
  overrideModAttrs = _previous: {
    preBuild = ''
      substituteInPlace go.mod --replace 'go 1.16' 'go 1.17'
      go mod tidy
    '';
    postInstall = ''
      cp go.mod "$out/go.mod"
    '';
  };

  # In the main build, use the go.mod that the vendoring step produced.
  preBuild = ''
    cp vendor/go.mod go.mod
  '';

  nativeBuildInputs = [ cudaPackages.autoAddOpenGLRunpathHook ];

  buildInputs = [ dcgm ];

  # Link the cgo parts against libdcgm.
  CGO_LDFLAGS = "-ldcgm";

  # BIND_NOW hardening is incompatible with gonvml and go-dcgm: some symbols
  # are not yet available when the binary starts.
  hardeningDisable = [ "bindnow" ];

  # The test suite expects to talk to a running DCGM service.
  doCheck = false;

  postFixup = ''
    patchelf --add-needed libnvidia-ml.so "$out/bin/dcgm-exporter"
  '';

  meta = {
    description = "NVIDIA GPU metrics exporter for Prometheus leveraging DCGM";
    homepage = "https://github.com/NVIDIA/dcgm-exporter";
    license = lib.licenses.asl20;
    maintainers = lib.teams.deshaw.members;
    mainProgram = "dcgm-exporter";
    platforms = lib.platforms.linux;
  };
}
|
|
@ -555,6 +555,8 @@ with pkgs;
|
||||||
|
|
||||||
dbip-country-lite = callPackage ../data/misc/dbip-country-lite { };
|
dbip-country-lite = callPackage ../data/misc/dbip-country-lite { };
|
||||||
|
|
||||||
|
dcgm = callPackage ../os-specific/linux/dcgm { };
|
||||||
|
|
||||||
dhallDirectoryToNix = callPackage ../build-support/dhall/directory-to-nix.nix { };
|
dhallDirectoryToNix = callPackage ../build-support/dhall/directory-to-nix.nix { };
|
||||||
|
|
||||||
dhallPackageToNix = callPackage ../build-support/dhall/package-to-nix.nix { };
|
dhallPackageToNix = callPackage ../build-support/dhall/package-to-nix.nix { };
|
||||||
|
@ -25017,7 +25019,11 @@ with pkgs;
|
||||||
|
|
||||||
taskflow = callPackage ../development/libraries/taskflow { };
|
taskflow = callPackage ../development/libraries/taskflow { };
|
||||||
|
|
||||||
tclap = callPackage ../development/libraries/tclap { };
|
tclap = tclap_1_2;
|
||||||
|
|
||||||
|
tclap_1_2 = callPackage ../development/libraries/tclap/1.2.nix { };
|
||||||
|
|
||||||
|
tclap_1_4 = callPackage ../development/libraries/tclap/1.4.nix { };
|
||||||
|
|
||||||
tcllib = callPackage ../development/libraries/tcllib { };
|
tcllib = callPackage ../development/libraries/tcllib { };
|
||||||
|
|
||||||
|
@ -26847,6 +26853,7 @@ with pkgs;
|
||||||
prometheus-cloudflare-exporter = callPackage ../servers/monitoring/prometheus/cloudflare-exporter.nix { };
|
prometheus-cloudflare-exporter = callPackage ../servers/monitoring/prometheus/cloudflare-exporter.nix { };
|
||||||
prometheus-collectd-exporter = callPackage ../servers/monitoring/prometheus/collectd-exporter.nix { };
|
prometheus-collectd-exporter = callPackage ../servers/monitoring/prometheus/collectd-exporter.nix { };
|
||||||
prometheus-consul-exporter = callPackage ../servers/monitoring/prometheus/consul-exporter.nix { };
|
prometheus-consul-exporter = callPackage ../servers/monitoring/prometheus/consul-exporter.nix { };
|
||||||
|
prometheus-dcgm-exporter = callPackage ../servers/monitoring/prometheus/dcgm-exporter { };
|
||||||
prometheus-dnsmasq-exporter = callPackage ../servers/monitoring/prometheus/dnsmasq-exporter.nix { };
|
prometheus-dnsmasq-exporter = callPackage ../servers/monitoring/prometheus/dnsmasq-exporter.nix { };
|
||||||
prometheus-dovecot-exporter = callPackage ../servers/monitoring/prometheus/dovecot-exporter.nix { };
|
prometheus-dovecot-exporter = callPackage ../servers/monitoring/prometheus/dovecot-exporter.nix { };
|
||||||
prometheus-domain-exporter = callPackage ../servers/monitoring/prometheus/domain-exporter.nix { };
|
prometheus-domain-exporter = callPackage ../servers/monitoring/prometheus/domain-exporter.nix { };
|
||||||
|
|
Loading…
Reference in a new issue