python3Packages.openai-triton: Use custom LLVM and refactor

This project does not rely on ROCm's LLVM fork! Using that fork can cause a lot of problems whenever it is upgraded!
2023-10-06 18:18:24 -05:00 · 2023-10-06 18:18:24 -05:00 · 691b69fd80
commit 691b69fd80
parent e6f88a9a82
4 changed files with 188 additions and 4765 deletions
--- a/pkgs/development/python-modules/openai-triton/default.nix
+++ b/pkgs/development/python-modules/openai-triton/default.nix
@ -1,12 +1,13 @@
 { lib
+, callPackage
 , buildPythonPackage
-, python
-, fetchpatch
 , fetchFromGitHub
 , addOpenGLRunpath
+, pytestCheckHook
+, pythonRelaxDepsHook
+, pkgsTargetTarget
 , cmake
-, cudaPackages
-, llvmPackages
+, ninja
 , pybind11
 , gtest
 , zlib
@ -15,18 +16,11 @@
 , lit
 , filelock
 , torchWithRocm
-, pytest
-, pytestCheckHook
-, pythonRelaxDepsHook
-, pkgsTargetTarget
+, python
+, cudaPackages
 }:

 let
-  pname = "triton";
-  version = "2.0.0";
-
-  inherit (cudaPackages) cuda_cudart backendStdenv;
-
  # A time may come we'll want to be cross-friendly
  #
  # Short explanation: we need pkgsTargetTarget, because we use string
@ -38,20 +32,12 @@ let
  # pkgsTargetTarget maybe doesn't matter, because ptxas compiles programs to
  # be executed on the GPU.
  # Cf. https://nixos.org/manual/nixpkgs/unstable/#sec-cross-infra
-  ptxas = "${pkgsTargetTarget.cudaPackages.cuda_nvcc}/bin/ptxas";
-
-  llvm = (llvmPackages.llvm.override {
-    llvmTargetsToBuild = [ "NATIVE" "NVPTX" ];
-    # Upstream CI sets these too:
-    # targetProjects = [ "mlir" ];
-    extraCMakeFlags = [
-      "-DLLVM_INSTALL_UTILS=ON"
-    ];
-  });
+  ptxas = "${pkgsTargetTarget.cudaPackages.cuda_nvcc}/bin/ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py)
+  llvm = callPackage ./llvm.nix { }; # Use a custom llvm, see llvm.nix for details
 in
-buildPythonPackage {
-  inherit pname version;
-
+buildPythonPackage rec {
+  pname = "triton";
+  version = "2.0.0";
  format = "setuptools";

  src = fetchFromGitHub {
@ -62,21 +48,6 @@ buildPythonPackage {
  };

  patches = [
-    # Prerequisite for llvm15 patch
-    (fetchpatch {
-      url = "https://github.com/openai/triton/commit/2aba985daaa70234823ea8f1161da938477d3e02.patch";
-      hash = "sha256-LGv0+Ut2WYPC4Ksi4803Hwmhi3FyQOF9zElJc/JCobk=";
-    })
-    (fetchpatch {
-      url = "https://github.com/openai/triton/commit/e3941f9d09cdd31529ba4a41018cfc0096aafea6.patch";
-      hash = "sha256-A+Gor6qzFlGQhVVhiaaYOzqqx8yO2MdssnQS6TIfUWg=";
-    })
-
-    # Source: https://github.com/openai/triton/commit/fc7a8e35819bda632bdcf1cf75fd9abe4d4e077a.patch
-    # The original patch adds ptxas binary, so we include our own clean copy
-    # Drop with the next update
-    ./llvm15.patch
-
    # TODO: there have been commits upstream aimed at removing the "torch"
    # circular dependency, but the patches fail to apply on the release
    # revision. Keeping the link for future reference
@ -88,70 +59,11 @@ buildPythonPackage {
    # })
  ];

-  postPatch = ''
-    substituteInPlace python/setup.py \
-      --replace \
-        '= get_thirdparty_packages(triton_cache_path)' \
-        '= os.environ["cmakeFlags"].split()'
-  ''
-  # Wiring triton=2.0.0 with llcmPackages_rocm.llvm=5.4.3
-  # Revisit when updating either triton or llvm
-  + ''
-    substituteInPlace CMakeLists.txt \
-      --replace "nvptx" "NVPTX" \
-      --replace "LLVM 11" "LLVM"
-    sed -i '/AddMLIR/a set(MLIR_TABLEGEN_EXE "${llvmPackages.mlir}/bin/mlir-tblgen")' CMakeLists.txt
-    sed -i '/AddMLIR/a set(MLIR_INCLUDE_DIR ''${MLIR_INCLUDE_DIRS})' CMakeLists.txt
-    find -iname '*.td' -exec \
-      sed -i \
-      -e '\|include "mlir/IR/OpBase.td"|a include "mlir/IR/AttrTypeBase.td"' \
-      -e 's|include "mlir/Dialect/StandardOps/IR/Ops.td"|include "mlir/Dialect/Func/IR/FuncOps.td"|' \
-      '{}' ';'
-    substituteInPlace unittest/CMakeLists.txt --replace "include(GoogleTest)" "find_package(GTest REQUIRED)"
-    sed -i 's/^include.*$//' unittest/CMakeLists.txt
-    sed -i '/LINK_LIBS/i NVPTXInfo' lib/Target/PTX/CMakeLists.txt
-    sed -i '/LINK_LIBS/i NVPTXCodeGen' lib/Target/PTX/CMakeLists.txt
-  ''
-  # TritonMLIRIR already links MLIRIR. Not transitive?
-  # + ''
-  #   echo "target_link_libraries(TritonPTX PUBLIC MLIRIR)" >> lib/Target/PTX/CMakeLists.txt
-  # ''
-  # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS
-  + ''
-    substituteInPlace bin/CMakeLists.txt \
-      --replace "add_subdirectory(FileCheck)" ""
-
-    rm cmake/FindLLVM.cmake
-  ''
-  +
-  (
-    let
-      # Bash was getting weird without linting,
-      # but basically upstream contains [cc, ..., "-lcuda", ...]
-      # and we replace it with [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...]
-      old = [ "-lcuda" ];
-      new = [ "-lcuda" "-L${addOpenGLRunpath.driverLink}" "-L${cuda_cudart}/lib/stubs/" ];
-
-      quote = x: ''"${x}"'';
-      oldStr = lib.concatMapStringsSep ", " quote old;
-      newStr = lib.concatMapStringsSep ", " quote new;
-    in
-    ''
-      substituteInPlace python/triton/compiler.py \
-        --replace '${oldStr}' '${newStr}'
-    ''
-  )
-  # Triton seems to be looking up cuda.h
-  + ''
-    sed -i 's|cu_include_dir = os.path.join.*$|cu_include_dir = "${cuda_cudart}/include"|' python/triton/compiler.py
-  '';
-
  nativeBuildInputs = [
-    cmake
    pythonRelaxDepsHook
-
-    # Requires torch (circular dependency) and probably needs GPUs:
-    # pytestCheckHook
+    # pytestCheckHook # Requires torch (circular dependency) and probably needs GPUs
+    cmake
+    ninja

    # Note for future:
    # These *probably* should go in depsTargetTarget
@ -159,7 +71,6 @@ buildPythonPackage {
    # because we only support cudaPackages on x86_64-linux atm
    lit
    llvm
-    llvmPackages.mlir
  ];

  buildInputs = [
@ -170,17 +81,44 @@ buildPythonPackage {
    zlib
  ];

-  propagatedBuildInputs = [
-    filelock
-  ];
+  propagatedBuildInputs = [ filelock ];
+
+  postPatch = let
+    # Bash was getting weird without linting,
+    # but basically upstream contains [cc, ..., "-lcuda", ...]
+    # and we replace it with [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...]
+    old = [ "-lcuda" ];
+    new = [ "-lcuda" "-L${addOpenGLRunpath.driverLink}" "-L${cudaPackages.cuda_cudart}/lib/stubs/" ];
+
+    quote = x: ''"${x}"'';
+    oldStr = lib.concatMapStringsSep ", " quote old;
+    newStr = lib.concatMapStringsSep ", " quote new;
+  in ''
+    # Use our `cmakeFlags` instead and avoid downloading dependencies
+    substituteInPlace python/setup.py \
+      --replace "= get_thirdparty_packages(triton_cache_path)" "= os.environ[\"cmakeFlags\"].split()"
+
+    # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS
+    substituteInPlace bin/CMakeLists.txt \
+      --replace "add_subdirectory(FileCheck)" ""
+
+    # Use our linker flags
+    substituteInPlace python/triton/compiler.py \
+      --replace '${oldStr}' '${newStr}'
+
+    # Don't fetch googletest
+    substituteInPlace unittest/CMakeLists.txt \
+      --replace "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" ""\
+      --replace "include(GoogleTest)" "find_package(GTest REQUIRED)"
+  '';

  # Avoid GLIBCXX mismatch with other cuda-enabled python packages
  preConfigure = ''
-    export CC="${backendStdenv.cc}/bin/cc";
-    export CXX="${backendStdenv.cc}/bin/c++";
+    export CC=${cudaPackages.backendStdenv.cc}/bin/cc;
+    export CXX=${cudaPackages.backendStdenv.cc}/bin/c++;

    # Upstream's setup.py tries to write cache somewhere in ~/
-    export HOME=$TMPDIR
+    export HOME=$(mktemp -d)

    # Upstream's github actions patch setup.cfg to write base-dir. May be redundant
    echo "
@ -188,52 +126,41 @@ buildPythonPackage {
    base-dir=$PWD" >> python/setup.cfg

    # The rest (including buildPhase) is relative to ./python/
-    cd python/
+    cd python

    # Work around download_and_copy_ptxas()
-    dst_cuda="$PWD/triton/third_party/cuda/bin"
-    mkdir -p "$dst_cuda"
-    ln -s "${ptxas}" "$dst_cuda/"
+    mkdir -p $PWD/triton/third_party/cuda/bin
+    ln -s ${ptxas} $PWD/triton/third_party/cuda/bin
  '';

  # CMake is run by setup.py instead
  dontUseCmakeConfigure = true;
-  cmakeFlags = [
-    "-DMLIR_DIR=${llvmPackages.mlir}/lib/cmake/mlir"
-  ];

-  postFixup =
-    let
-      ptxasDestination = "$out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas";
-    in
-    # Setuptools (?) strips runpath and +x flags. Let's just restore the symlink
-    ''
-      rm -f ${ptxasDestination}
-      ln -s ${ptxas} ${ptxasDestination}
-    '';
+  # Setuptools (?) strips runpath and +x flags. Let's just restore the symlink
+  postFixup = ''
+    rm -f $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas
+    ln -s ${ptxas} $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas
+  '';

-  checkInputs = [
-    cmake # ctest
-  ];
+  checkInputs = [ cmake ]; # ctest
  dontUseSetuptoolsCheck = true;
-  preCheck =
+
+  preCheck = ''
    # build/temp* refers to build_ext.build_temp (looked up in the build logs)
-    ''
-      (cd /build/source/python/build/temp* ; ctest)
-    '' # For pytestCheckHook
-    + ''
-      cd test/unit
-    '';
-  pythonImportsCheck = [
-    # Circular dependency on torch
-    # "triton"
-    # "triton.language"
-  ];
+    (cd /build/source/python/build/temp* ; ctest)
+
+    # For pytestCheckHook
+    cd test/unit
+  '';
+
+  # Circular dependency on torch
+  # pythonImportsCheck = [
+  #   "triton"
+  #   "triton.language"
+  # ];

  # Ultimately, torch is our test suite:
-  passthru.tests = {
-    inherit torchWithRocm;
-  };
+  passthru.tests = { inherit torchWithRocm; };

  pythonRemoveDeps = [
    # Circular dependency, cf. https://github.com/openai/triton/issues/1374
@ -243,11 +170,12 @@ buildPythonPackage {
    "cmake"
    "lit"
  ];
+
  meta = with lib; {
-    description = "Development repository for the Triton language and compiler";
-    homepage = "https://github.com/openai/triton/";
+    description = "Language and compiler for writing highly efficient custom Deep-Learning primitives";
+    homepage = "https://github.com/openai/triton";
    platforms = lib.platforms.unix;
    license = licenses.mit;
-    maintainers = with maintainers; [ SomeoneSerge ];
+    maintainers = with maintainers; [ SomeoneSerge Madouura ];
  };
 }
--- a/pkgs/development/python-modules/openai-triton/llvm.nix
+++ b/pkgs/development/python-modules/openai-triton/llvm.nix
@ -0,0 +1,112 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, pkg-config
+, cmake
+, ninja
+, git
+, doxygen
+, sphinx
+, libxml2
+, libxcrypt
+, libedit
+, libffi
+, mpfr
+, zlib
+, ncurses
+, python3Packages
+, buildDocs ? true
+, buildMan ? true
+, buildTests ? true
+}:
+
+stdenv.mkDerivation (finalAttrs: {
+  pname = "triton-llvm";
+  version = "14.0.6-f28c006a5895";
+
+  outputs = [
+    "out"
+  ] ++ lib.optionals buildDocs [
+    "doc"
+  ] ++ lib.optionals buildMan [
+    "man"
+  ];
+
+  # See https://github.com/openai/triton/blob/main/python/setup.py and https://github.com/ptillet/triton-llvm-releases/releases
+  src = fetchFromGitHub {
+    owner = "llvm";
+    repo = "llvm-project";
+    rev = "f28c006a5895fc0e329fe15fead81e37457cb1d1";
+    hash = "sha256-vffu4HilvYwtzwgq+NlS26m65DGbp6OSSne2aje1yJE=";
+  };
+
+  nativeBuildInputs = [
+    pkg-config
+    cmake
+    ninja
+    git
+    python3Packages.python
+  ] ++ lib.optionals (buildDocs || buildMan) [
+    doxygen
+    sphinx
+    python3Packages.recommonmark
+  ];
+
+  buildInputs = [
+    libxml2
+    libxcrypt
+    libedit
+    libffi
+    mpfr
+  ];
+
+  propagatedBuildInputs = [
+    zlib
+    ncurses
+  ];
+
+  sourceRoot = "${finalAttrs.src.name}/llvm";
+
+  cmakeFlags = [
+    "-DLLVM_TARGETS_TO_BUILD=X86;AMDGPU;NVPTX"
+    "-DLLVM_ENABLE_PROJECTS=llvm;mlir"
+    "-DLLVM_INSTALL_UTILS=ON"
+  ] ++ lib.optionals (buildDocs || buildMan) [
+    "-DLLVM_INCLUDE_DOCS=ON"
+    "-DMLIR_INCLUDE_DOCS=ON"
+    "-DLLVM_BUILD_DOCS=ON"
+    # "-DLLVM_ENABLE_DOXYGEN=ON" Way too slow, only uses one core
+    "-DLLVM_ENABLE_SPHINX=ON"
+    "-DSPHINX_OUTPUT_HTML=ON"
+    "-DSPHINX_OUTPUT_MAN=ON"
+    "-DSPHINX_WARNINGS_AS_ERRORS=OFF"
+  ] ++ lib.optionals buildTests [
+    "-DLLVM_INCLUDE_TESTS=ON"
+    "-DMLIR_INCLUDE_TESTS=ON"
+    "-DLLVM_BUILD_TESTS=ON"
+  ];
+
+  postPatch = ''
+    # `CMake Error: cannot write to file "/build/source/llvm/build/lib/cmake/mlir/MLIRTargets.cmake": Permission denied`
+    chmod +w -R ../mlir
+
+    # FileSystem permissions tests fail with various special bits
+    rm test/tools/llvm-objcopy/ELF/mirror-permissions-unix.test
+    rm unittests/Support/Path.cpp
+
+    substituteInPlace unittests/Support/CMakeLists.txt \
+      --replace "Path.cpp" ""
+  '';
+
+  doCheck = buildTests;
+  requiredSystemFeatures = [ "big-parallel" ];
+
+  meta = with lib; {
+    description = "Collection of modular and reusable compiler and toolchain technologies";
+    homepage = "https://github.com/llvm/llvm-project";
+    license = with licenses; [ ncsa ];
+    maintainers = with maintainers; [ SomeoneSerge Madouura ];
+    platforms = platforms.linux;
+    broken = stdenv.isAarch64; # https://github.com/RadeonOpenCompute/ROCm/issues/1831#issuecomment-1278205344
+  };
+})
--- a/pkgs/development/python-modules/openai-triton/llvm15.patch
+++ b/pkgs/development/python-modules/openai-triton/llvm15.patch
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@ -8351,7 +8351,7 @@ self: super: with self; {

  open-meteo = callPackage ../development/python-modules/open-meteo { };

-  openai-triton = callPackage ../development/python-modules/openai-triton { llvmPackages = pkgs.rocmPackages.llvm; };
+  openai-triton = callPackage ../development/python-modules/openai-triton { cudaPackages = pkgs.cudaPackages_12_0; };

  openai-triton-bin = callPackage ../development/python-modules/openai-triton/bin.nix { };