nixpkgs/pkgs/development/python-modules/tokenizers/default.nix

{ lib
, fetchFromGitHub
, fetchurl
, buildPythonPackage
, rustPlatform
, setuptools-rust
, numpy
, datasets
, pytestCheckHook
, requests
}:

let
  # Common prefix of the Hugging Face model bucket hosting most fixtures below.
  hfBucket = "https://s3.amazonaws.com/models.huggingface.co/bert";

  # Data files needed by the test suite. Each one is fetched with a pinned
  # hash and symlinked into tests/data by postUnpack, so that the tests do
  # not attempt network access.

  # RoBERTa vocabulary and merges.
  robertaVocab = fetchurl {
    url = "${hfBucket}/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "${hfBucket}/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };

  # ALBERT tokenizer definition.
  albertVocab = fetchurl {
    url = "${hfBucket}/albert-base-v1-tokenizer.json";
    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
  };

  # BERT (uncased) vocabulary.
  bertVocab = fetchurl {
    url = "${hfBucket}/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };

  # Plain-text corpus; the only fixture not hosted in the Hugging Face bucket.
  norvigBig = fetchurl {
    url = "https://norvig.com/big.txt";
    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
  };

  # Tokenizers used by the documentation tests; linked as bert-wiki.json and
  # tokenizer-wiki.json respectively.
  docPipelineTokenizer = fetchurl {
    url = "${hfBucket}/anthony/doc-pipeline/tokenizer.json";
    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
  };
  docQuicktourTokenizer = fetchurl {
    url = "${hfBucket}/anthony/doc-quicktour/tokenizer.json";
    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
  };

  # OpenAI GPT vocabulary and merges.
  openaiVocab = fetchurl {
    url = "${hfBucket}/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "${hfBucket}/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in buildPythonPackage rec {
  pname = "tokenizers";
  version = "unstable-2021-08-13";

  # Pinned to a specific upstream commit rather than a release tag.
  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    rev = "e7dd6436dd4a4ffd9e8a4f110ca68e6a38677cb6";
    sha256 = "1p7w9a43a9h6ys5nsa4g89l65dj11037p7a1lqkj4x1yc9kv2y1r";
  };

  # Build from the Python bindings subdirectory of the upstream repository.
  sourceRoot = "source/bindings/python";

  # Vendored Cargo dependencies for the Rust extension. `sourceRoot` is
  # inherited so the lock file is looked up in the bindings directory.
  cargoDeps = rustPlatform.fetchCargoTarball {
    inherit src sourceRoot;
    name = "${pname}-${version}";
    sha256 = "1yb4jsx6mp9jgd1g3mli6vr6mri2afnwqlmxq1rpvn34z6b3iw9q";
  };

  # setuptools-rust drives the build; cargoSetupHook wires up `cargoDeps`.
  nativeBuildInputs = [ setuptools-rust ] ++ (with rustPlatform; [
    cargoSetupHook
    rust.cargo
    rust.rustc
  ]);

  propagatedBuildInputs = [
    numpy
  ];

  checkInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    # `-p` keeps this step working even if the directory already exists.
    mkdir -p "$sourceRoot/tests/data"
    (
      cd "$sourceRoot/tests/data"
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${albertVocab} albert-base-v1-tokenizer.json
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${docPipelineTokenizer} bert-wiki.json
      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
      ln -s ${norvigBig} big.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt
    )
  '';

  # Force the "fork" multiprocessing start method for the test suite.
  # NOTE(review): presumably needed on platforms where the default start
  # method is not "fork" - confirm against the upstream tests.
  postPatch = ''
    echo 'import multiprocessing; multiprocessing.set_start_method("fork")' >> tests/__init__.py
  '';

  # Give the tests a private, writable HOME instead of the build root.
  # NOTE(review): presumably required because a test dependency writes
  # cache files below HOME - confirm.
  preCheck = ''
    export HOME=$(mktemp -d)
  '';

  disabledTests = [
    # Downloads data using the datasets module.
    "TestTrainFromIterators"
  ];

  # Verify that the installed module can be imported.
  pythonImportsCheck = [ "tokenizers" ];

  meta = {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = lib.licenses.asl20;
    platforms = lib.platforms.unix;
    maintainers = [ ];
  };
}
treewide: remove stdenv where not needed 2021-01-25 09:26:54 +01:00			`{ lib`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`, fetchFromGitHub`
			`, fetchurl`
python3Packages.tokenizers: switch to buildPythonPackage Use the new cargoSetupHook to set up Cargo vendoring, so that we do not need buildRustPackage anymore. 2021-02-09 11:41:01 +01:00			`, buildPythonPackage`
			`, rustPlatform`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`, setuptools-rust`
			`, numpy`
python3Packages.tokenizers: 0.9.4 -> 0.10.0 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.10.0 2021-01-16 15:07:33 +01:00			`, datasets`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`, pytestCheckHook`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`, requests`
			`}:`

			`let`
			`robertaVocab = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";`
			`sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";`
			`};`
			`robertaMerges = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";`
			`sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";`
			`};`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`albertVocab = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";`
			`sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";`
			`};`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`bertVocab = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";`
			`sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";`
			`};`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`norvigBig = fetchurl {`
			`url = "https://norvig.com/big.txt";`
			`sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";`
			`};`
python3Packages.tokenizers: 0.9.2 -> 0.9.4 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.3 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.4 2020-11-10 18:16:13 +01:00			`docPipelineTokenizer = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";`
			`hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";`
			`};`
			`docQuicktourTokenizer = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";`
			`hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";`
			`};`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`openaiVocab = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";`
			`sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";`
			`};`
			`openaiMerges = fetchurl {`
			`url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";`
			`sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";`
			`};`
python3Packages.tokenizers: switch to buildPythonPackage Use the new cargoSetupHook to set up Cargo vendoring, so that we do not need buildRustPackage anymore. 2021-02-09 11:41:01 +01:00			`in buildPythonPackage rec {`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`pname = "tokenizers";`
python3Packages.tokenizers: 0.10.3 -> unstable-2021-08-13 2021-08-26 18:15:36 +02:00			`version = "unstable-2021-08-13";`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00
			`src = fetchFromGitHub {`
			`owner = "huggingface";`
			`repo = pname;`
python3Packages.tokenizers: 0.10.3 -> unstable-2021-08-13 2021-08-26 18:15:36 +02:00			`rev = "e7dd6436dd4a4ffd9e8a4f110ca68e6a38677cb6";`
			`sha256 = "1p7w9a43a9h6ys5nsa4g89l65dj11037p7a1lqkj4x1yc9kv2y1r";`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`};`

python3Packages.tokenizers: switch to buildPythonPackage Use the new cargoSetupHook to set up Cargo vendoring, so that we do not need buildRustPackage anymore. 2021-02-09 11:41:01 +01:00			`cargoDeps = rustPlatform.fetchCargoTarball {`
			`inherit src sourceRoot;`
			`name = "${pname}-${version}";`
python3Packages.tokenizers: 0.10.3 -> unstable-2021-08-13 2021-08-26 18:15:36 +02:00			`sha256 = "1yb4jsx6mp9jgd1g3mli6vr6mri2afnwqlmxq1rpvn34z6b3iw9q";`
python3Packages.tokenizers: switch to buildPythonPackage Use the new cargoSetupHook to set up Cargo vendoring, so that we do not need buildRustPackage anymore. 2021-02-09 11:41:01 +01:00			`};`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00
			`sourceRoot = "source/bindings/python";`

python3Packages.tokenizers: switch to buildPythonPackage Use the new cargoSetupHook to set up Cargo vendoring, so that we do not need buildRustPackage anymore. 2021-02-09 11:41:01 +01:00			`nativeBuildInputs = [ setuptools-rust ] ++ (with rustPlatform; [`
			`cargoSetupHook`
			`rust.cargo`
			`rust.rustc`
			`]);`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00
			`propagatedBuildInputs = [`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`numpy`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`];`

python3Packages.tokenizers: canonicalize tests Make handling of tests more like other Python derivations (and they actually run again). 2021-05-25 17:07:38 +02:00			`checkInputs = [`
python3Packages.tokenizers: 0.9.4 -> 0.10.0 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.10.0 2021-01-16 15:07:33 +01:00			`datasets`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`pytestCheckHook`
			`requests`
			`];`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00
			`postUnpack = ''`
			`# Add data files for tests, otherwise tests attempt network access.`
			`mkdir $sourceRoot/tests/data`
			`( cd $sourceRoot/tests/data`
			`ln -s ${robertaVocab} roberta-base-vocab.json`
			`ln -s ${robertaMerges} roberta-base-merges.txt`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`ln -s ${albertVocab} albert-base-v1-tokenizer.json`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`ln -s ${bertVocab} bert-base-uncased-vocab.txt`
python3Packages.tokenizers: 0.9.2 -> 0.9.4 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.3 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.4 2020-11-10 18:16:13 +01:00			`ln -s ${docPipelineTokenizer} bert-wiki.json`
			`ln -s ${docQuicktourTokenizer} tokenizer-wiki.json`
python3Packages.tokenizers: 0.8.1 -> 0.9.2 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1 https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2 Changes in the derivation: * Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore. * Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead. * Add additional data files needed for tests. * Use `pytestCheckHook`. 2020-10-16 11:40:35 +02:00			`ln -s ${norvigBig} big.txt`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`ln -s ${openaiVocab} openai-gpt-vocab.json`
			`ln -s ${openaiMerges} openai-gpt-merges.txt )`
			`'';`

python3Packages.tokenizers: 0.10.3 -> unstable-2021-08-13 2021-08-26 18:15:36 +02:00			`postPatch = ''`
			`echo 'import multiprocessing; multiprocessing.set_start_method("fork")' >> tests/__init__.py`
			`'';`

python3Packages.tokenizers: 0.9.4 -> 0.10.0 Changelog: https://github.com/huggingface/tokenizers/releases/tag/python-v0.10.0 2021-01-16 15:07:33 +01:00			`preCheck = ''`
			`HOME=$TMPDIR`
			`'';`

			`disabledTests = [`
			`# Downloads data using the datasets module.`
			`"TestTrainFromIterators"`
			`];`

treewide: with stdenv.lib; in meta -> with lib; Part of: https://github.com/NixOS/nixpkgs/issues/108938 meta = with stdenv.lib; is a widely used pattern. We want to slowly remove the `stdenv.lib` indirection and encourage people to use `lib` directly. Thus let’s start with the meta field. This used a rewriting script to mostly automatically replace all occurrences of this pattern, and add the `lib` argument to the package header if it doesn’t exist yet. The script in its current form is available at https://cs.tvl.fyi/depot@2f807d7f141068d2d60676a89213eaa5353ca6e0/-/blob/users/Profpatsch/nixpkgs-rewriter/default.nix 2021-01-11 08:54:33 +01:00			`meta = with lib; {`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`homepage = "https://github.com/huggingface/tokenizers";`
			`description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";`
			`license = licenses.asl20;`
			`platforms = platforms.unix;`
treewide: remove danieldk as maintainer from a set of packages I currently do not have much time to work on nixpkgs. Remove myself as a maintainer from a bunch of packages to avoid that people are waiting on me for a review. 2021-09-12 16:42:12 +02:00			`maintainers = with maintainers; [ ];`
python3Packages.tokenizers: init at 0.8.0 2020-06-23 10:12:54 +02:00			`};`
			`}`