diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 0dcc2fdba2d9..1fa790686cac 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -1,4 +1,4 @@
-{ stdenv, lib, fetchFromGitHub, buildPythonPackage, python,
+{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
   config, cudaSupport ? config.cudaSupport, cudaPackages, magma,
   useSystemNccl ? true,
   MPISupport ? false, mpi,
@@ -52,17 +52,8 @@
 
 let
   inherit (lib) lists strings trivial;
-  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
-in
+  inherit (cudaPackages) cudaFlags cudnn nccl;
 
-assert cudaSupport -> stdenv.isLinux;
-assert cudaSupport -> (cudaPackages.cudaMajorVersion == "11");
-
-# confirm that cudatoolkits are sync'd across dependencies
-assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit;
-assert !cudaSupport || magma.cudaPackages.cudatoolkit == cudatoolkit;
-
-let
   setBool = v: if v then "1" else "0";
 
   # https://github.com/pytorch/pytorch/blob/v2.0.1/torch/utils/cpp_extension.py#L1744
@@ -103,23 +94,6 @@ let
       throw "No GPU targets specified"
   );
 
-  cudatoolkit_joined = symlinkJoin {
-    name = "${cudatoolkit.name}-unsplit";
-    # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs
-    paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
-  };
-
-  # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
-  # LD_LIBRARY_PATH=/run/opengl-driver/lib.  We only use the stub
-  # libcuda.so from cudatoolkit for running tests, so that we don’t have
-  # to recompile pytorch on every update to nvidia-x11 or the kernel.
-  cudaStub = linkFarm "cuda-stub" [{
-    name = "libcuda.so.1";
-    path = "${cudatoolkit}/lib/stubs/libcuda.so";
-  }];
-  cudaStubEnv = lib.optionalString cudaSupport
-    "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
-
   rocmtoolkit_joined = symlinkJoin {
     name = "rocm-merged";
 
@@ -160,6 +134,12 @@ in buildPythonPackage rec {
     # base is 10.12. Until we upgrade, we can fall back on the older
     # pthread support.
     ./pthreadpool-disable-gcd.diff
+  ] ++ lib.optionals stdenv.isLinux [
+    # Propagate CUPTI to Kineto by overriding the search path with environment variables.
+    (fetchpatch {
+      url = "https://github.com/pytorch/pytorch/pull/108847/commits/7ae4d7c0e2dec358b4fe81538efe9da5eb580ec9.patch";
+      hash = "sha256-skFaDg98xcJqJfzxWk+qhUxPLHDStqvd0mec3PgksIg=";
+    })
   ];
 
   postPatch = lib.optionalString rocmSupport ''
@@ -184,6 +164,13 @@ in buildPythonPackage rec {
       --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
         "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." hip.version))})"
   ''
+  # Detection of NCCL version doesn't work particularly well when using the static binary.
+  + lib.optionalString cudaSupport ''
+    substituteInPlace cmake/Modules/FindNCCL.cmake \
+      --replace \
+        'message(FATAL_ERROR "Found NCCL header version and library version' \
+        'message(WARNING "Found NCCL header version and library version'
+  ''
   # error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc'
   # This lib overrided aligned_alloc hence the error message. Tltr: his function is linkable but not in header.
   + lib.optionalString (stdenv.isDarwin && lib.versionOlder stdenv.targetPlatform.darwinSdkVersion "11.0") ''
@@ -192,12 +179,16 @@ in buildPythonPackage rec {
     inline void *aligned_alloc(size_t align, size_t size)'
   '';
 
+  # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken
+  # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time
+  # without extreme care to ensure they don't lock each other out of shared resources.
+  # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195.
   preConfigure = lib.optionalString cudaSupport ''
     export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
-    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
-  '' + lib.optionalString (cudaSupport && cudnn != null) ''
     export CUDNN_INCLUDE_DIR=${cudnn.dev}/include
     export CUDNN_LIB_DIR=${cudnn.lib}/lib
+    export CUPTI_INCLUDE_DIR=${cudaPackages.cuda_cupti.dev}/include
+    export CUPTI_LIBRARY_DIR=${cudaPackages.cuda_cupti.lib}/lib
   '' + lib.optionalString rocmSupport ''
     export ROCM_PATH=${rocmtoolkit_joined}
     export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
@@ -256,6 +247,7 @@ in buildPythonPackage rec {
   PYTORCH_BUILD_NUMBER = 0;
 
   USE_SYSTEM_NCCL = setBool useSystemNccl;                  # don't build pytorch's third_party NCCL
+  USE_STATIC_NCCL = setBool useSystemNccl;
 
   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
   # (upstream seems to have fixed this in the wrong place?)
@@ -286,12 +278,43 @@ in buildPythonPackage rec {
     pybind11
     pythonRelaxDepsHook
     removeReferencesTo
-  ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ]
-    ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
+  ] ++ lib.optionals cudaSupport (with cudaPackages; [
+    autoAddOpenGLRunpathHook
+    cuda_nvcc
+  ])
+  ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
 
   buildInputs = [ blas blas.provider pybind11 ]
     ++ lib.optionals stdenv.isLinux [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now
-    ++ lib.optionals cudaSupport [ cudnn.dev cudnn.lib nccl ]
+    ++ lib.optionals cudaSupport (with cudaPackages; [
+      cuda_cccl.dev # <thrust/*>
+      cuda_cudart # cuda_runtime.h and libraries
+      cuda_cupti.dev # For kineto
+      cuda_cupti.lib # For kineto
+      cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildInputs, it's needed here too
+      cuda_nvml_dev.dev # <nvml.h>
+      cuda_nvrtc.dev
+      cuda_nvrtc.lib
+      cuda_nvtx.dev
+      cuda_nvtx.lib # -lnvToolsExt
+      cudnn.dev
+      cudnn.lib
+      libcublas.dev
+      libcublas.lib
+      libcufft.dev
+      libcufft.lib
+      libcurand.dev
+      libcurand.lib
+      libcusolver.dev
+      libcusolver.lib
+      libcusparse.dev
+      libcusparse.lib
+      nccl.dev # Provides nccl.h AND a static copy of NCCL!
+    ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
+      cuda_nvprof.dev # <cuda_profiler_api.h>
+    ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
+      cuda_profiler_api.dev # <cuda_profiler_api.h>
+    ])
     ++ lib.optionals rocmSupport [ openmp ]
     ++ lib.optionals (cudaSupport || rocmSupport) [ magma ]
     ++ lib.optionals stdenv.isLinux [ numactl ]
@@ -335,7 +358,6 @@ in buildPythonPackage rec {
 
   checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [
     "runHook preCheck"
-    cudaStubEnv
     "${python.interpreter} test/run_test.py"
     "--exclude"
     (concatStringsSep " " [
@@ -419,6 +441,17 @@ in buildPythonPackage rec {
     license = licenses.bsd3;
     maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
     platforms = with platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin;
-    broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive
+    broken = builtins.any trivial.id [
+      # CUDA and ROCm are mutually exclusive
+      (cudaSupport && rocmSupport)
+      # CUDA is only supported on Linux
+      (cudaSupport && !stdenv.isLinux)
+      # Only CUDA 11 is currently supported
+      (cudaSupport && (cudaPackages.cudaMajorVersion != "11"))
+      # MPI cudatoolkit does not match cudaPackages.cudatoolkit
+      (MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit))
+      # Magma cudaPackages does not match cudaPackages
+      (cudaSupport && (magma.cudaPackages != cudaPackages))
+    ];
   };
 }