# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

# Autogenerated by pycargoebuild 0.15.0

EAPI=8

DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{10..13} )
DISTUTILS_EXT=1
DISTUTILS_SINGLE_IMPL=1
RUST_MIN_VER="1.82.0"

# Intentionally empty: the crates are fetched as pre-built tarballs through
# SRC_URI below instead of being listed one by one.
CRATES="
"

inherit cargo distutils-r1

DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
SRC_URI="
	https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
	-> ${P}.gh.tar.gz
	${CARGO_CRATE_URIS}
"
# NOTE(review): PKGBUMPING looks like a maintainer-side switch that leaves the
# crate tarballs out of SRC_URI while bumping this exact ${PVR} - confirm.
if [[ ${PKGBUMPING} != ${PVR} ]]; then
	SRC_URI+="
		https://dev.gentoo.org/~tupone/distfiles/${P}-crates.tar.xz
		https://dev.gentoo.org/~tupone/distfiles/${PN}-python-${PV}-crates.tar.xz
	"
fi

LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
	Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
	Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"

RDEPEND="dev-libs/oniguruma"
BDEPEND="
	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')
"

distutils_enable_tests pytest

QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"

# Layout used by the phases below: the Rust crate is handled from tokenizers/
# and the Python bindings from bindings/python/, both relative to ${S}.

# Unpack the sources using the cargo eclass implementation.
src_unpack() {
	cargo_src_unpack
}

# Run both the Python (single-impl) and the Rust toolchain setup.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}

# Run the default prepare step from the top of the tree, then patch and
# prepare the Python bindings from inside bindings/python.
src_prepare() {
	default

	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.21.2-test.patch
	distutils-r1_src_prepare
}

# Configure the Rust crate first, then the Python bindings.
src_configure() {
	cd tokenizers || die
	cargo_src_configure

	cd ../bindings/python || die
	distutils-r1_src_configure
}

# Build the Rust crate first, then the Python bindings.
src_compile() {
	# NOTE(review): presumably tells the onig crate to link against the
	# system oniguruma (see RDEPEND) rather than a bundled copy - confirm.
	export RUSTONIG_SYSTEM_LIBONIG=1

	cd tokenizers || die
	cargo_src_compile

	cd ../bindings/python || die
	distutils-r1_src_compile
}

# Run the pytest suite of the Python bindings only.
src_test() {
	# The Rust crate's own tests do not work, so cargo_src_test (which would
	# run from tokenizers/) is deliberately not called.
	cd bindings/python || die

	# Plain locals: bash cannot export arrays, so "-x" would have no effect.
	local EPYTEST_IGNORE=(
		benches/
	)
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)

	distutils-r1_src_test
}

# Install the Python bindings only; no cargo install step is run.
src_install() {
	cd bindings/python || die
	distutils-r1_src_install
}