Skip to content

Commit

Permalink
Support for HF Tokenizers (guillaume-be#408)
Browse files Browse the repository at this point in the history
* tokenizers output type conversion

* WIP hf Tokenizers support (2)

* finalize interface methods for hf tokenizers

* Addition of GPT2 example with hf tokenizers

* Made hf-tokenizers optional, added doc for HFTokenizer

* Addition of tests for hf tokenizers, addition to CI

* Updated changelog, extended documentation

* Fix Clippy warnings
  • Loading branch information
guillaume-be authored Aug 13, 2023
1 parent af3839e commit fd1e66b
Show file tree
Hide file tree
Showing 16 changed files with 1,096 additions and 64 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/continuous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ jobs:
--test nllb
--features download-libtorch

test-onnx:
name: Integration tests (ONNX models)
test-opt-features:
name: Integration tests (Optional features)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
Expand All @@ -155,7 +155,9 @@ jobs:
command: test
args: --package rust-bert
--features onnx
--features hf-tokenizers
--test onnx
--test hf_tokenizers
--features download-libtorch

convert-model:
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ All notable changes to this project will be documented in this file. The format

## [Unreleased]
## Added
- Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel` allowing passing custom tokenizers for sentence embeddings pipelines
- Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel` allowing passing custom tokenizers for sentence embeddings pipelines.
- Support for [Tokenizers](https://github.com/huggingface/tokenizers) in pipelines, allowing loading `tokenizer.json` and `special_token_map.json` tokenizer files.

## Fixed
- (BREAKING) Fixed the keyword extraction pipeline for n-gram sizes > 2. Add new configuration option `tokenizer_forbidden_ngram_chars` to specify characters that should be excluded from n-grams (allows filtering n-grams spanning multiple sentences).
Expand Down
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ download-libtorch = ["tch/download-libtorch"]
onnx = ["ort", "ndarray"]
rustls-tls = ["cached-path/rustls-tls"]
default-tls = ["cached-path/default-tls"]
hf-tokenizers = ["tokenizers"]

[package.metadata.docs.rs]
features = ["doc-only"]
Expand All @@ -89,6 +90,7 @@ dirs = { version = "4", optional = true }
lazy_static = { version = "1", optional = true }
ort = {version="~1.14.8", optional = true, default-features = false, features = ["half"]}
ndarray = {version="0.15", optional = true}
tokenizers = {version="0.13.3", optional=true, default-features = false, features = ["onig"]}

[dev-dependencies]
anyhow = "1"
Expand Down
61 changes: 61 additions & 0 deletions examples/generation_gpt2_hf_tokenizers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
// Copyright 2019 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

extern crate anyhow;

use rust_bert::pipelines::common::{ModelType, TokenizerOption};
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use rust_bert::resources::{RemoteResource, ResourceProvider};
use std::fs::File;
use std::io::Write;
use tempfile::TempDir;

/// Runs GPT2 text generation with a tokenizer loaded through
/// `TokenizerOption::from_hf_tokenizer_file` instead of the built-in GPT2 tokenizer.
///
/// The special token map is written to a temporary directory, the `tokenizer.json`
/// file is fetched as a remote resource, and the generated sentences for a single
/// prompt are printed to stdout.
///
/// # Errors
/// Propagates any failure from creating the temporary directory or file, writing the
/// special token map, resolving the remote tokenizer file, building the tokenizer or
/// building the generation model.
fn main() -> anyhow::Result<()> {
    // Generation settings: sampling disabled, a single beam, one returned sequence.
    let config = TextGenerationConfig {
        model_type: ModelType::GPT2,
        num_return_sequences: 1,
        num_beams: 1,
        do_sample: false,
        temperature: 1.0,
        max_length: Some(30),
        ..Default::default()
    };

    // Write the special token map into a scratch directory. `scratch_dir` must stay
    // alive until the tokenizer has been built, since dropping it removes the file.
    let scratch_dir = TempDir::new()?;
    let token_map_path = scratch_dir.path().join("special_token_map.json");
    let mut token_map_file = File::create(&token_map_path)?;
    // Doubled braces are `writeln!` escapes; the file receives a plain JSON object
    // mapping bos/eos/unk to `<|endoftext|>`.
    writeln!(
        token_map_file,
        r#"{{"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}}"#
    )?;

    // Resolve the tokenizer definition to a local path.
    let tokenizer_resource = RemoteResource::from_pretrained((
        "gpt2/tokenizer",
        "https://huggingface.co/gpt2/resolve/main/tokenizer.json",
    ));
    let tokenizer_file = tokenizer_resource.get_local_path()?;

    // Build the tokenizer from both files and hand it to the generation pipeline.
    let hf_tokenizer = TokenizerOption::from_hf_tokenizer_file(tokenizer_file, token_map_path)?;
    let generator = TextGenerationModel::new_with_tokenizer(config, hf_tokenizer)?;

    let prompt = "The dog";
    let generated = generator.generate(&[prompt], None);

    for sentence in generated {
        println!("{sentence:?}");
    }
    Ok(())
}
4 changes: 3 additions & 1 deletion src/models/longt5/attention.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ fn make_global_fixed_block_ids(
attention_mask: &Tensor,
global_block_size: i64,
) -> (Tensor, Tensor) {
let &[batch_size, seq_length, ..] = attention_mask.size().as_slice() else {unreachable!()};
let &[batch_size, seq_length, ..] = attention_mask.size().as_slice() else {
unreachable!()
};

let handle_orphan_tokens = |block_ids: Tensor| -> Tensor {
let block_ends = Tensor::arange(seq_length, (Kind::Int64, block_ids.device()))
Expand Down
104 changes: 80 additions & 24 deletions src/pipelines/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,18 @@ use rust_tokenizers::{TokenIdsWithOffsets, TokenizedInput, TokensWithOffsets};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::convert::TryFrom;

use std::fmt::Debug;

use std::path::{Path, PathBuf};
use tch::{Device, Kind, Tensor};

#[cfg(feature = "onnx")]
use crate::pipelines::onnx::ONNXModelConfig;

#[cfg(feature = "hf-tokenizers")]
use crate::pipelines::hf_tokenizers::HFTokenizer;

#[derive(Debug, Default)]
/// Container for ONNX model resources, containing 3 optional resources (Encoder, Decoder and Decoder with past)
pub struct ONNXModelResources {
Expand Down Expand Up @@ -288,6 +293,9 @@ pub enum TokenizerOption {
FNet(FNetTokenizer),
/// Bart Tokenizer
Bart(RobertaTokenizer),
/// HF Tokenizer
#[cfg(feature = "hf-tokenizers")]
HFTokenizer(HFTokenizer),
}

impl ConfigOption {
Expand Down Expand Up @@ -913,28 +921,13 @@ impl TokenizerOption {
Ok(tokenizer)
}

/// Returns the model type
pub fn model_type(&self) -> ModelType {
match *self {
Self::Bert(_) => ModelType::Bert,
Self::Deberta(_) => ModelType::Deberta,
Self::DebertaV2(_) => ModelType::DebertaV2,
Self::Roberta(_) => ModelType::Roberta,
Self::Bart(_) => ModelType::Bart,
Self::XLMRoberta(_) => ModelType::XLMRoberta,
Self::Marian(_) => ModelType::Marian,
Self::T5(_) => ModelType::T5,
Self::Albert(_) => ModelType::Albert,
Self::XLNet(_) => ModelType::XLNet,
Self::GPT2(_) => ModelType::GPT2,
Self::OpenAiGpt(_) => ModelType::OpenAiGpt,
Self::Reformer(_) => ModelType::Reformer,
Self::ProphetNet(_) => ModelType::ProphetNet,
Self::Pegasus(_) => ModelType::Pegasus,
Self::MBart50(_) => ModelType::MBart,
Self::M2M100(_) | Self::NLLB(_) => ModelType::M2M100,
Self::FNet(_) => ModelType::FNet,
}
/// Builds a `TokenizerOption::HFTokenizer` from tokenizer files on disk.
///
/// Only compiled when the `hf-tokenizers` feature is enabled.
///
/// # Arguments
///
/// * `tokenizer_file` - Path forwarded to `HFTokenizer::from_file` as the tokenizer
///   definition (presumably a Hugging Face `tokenizer.json` file; confirm against
///   `HFTokenizer::from_file`).
/// * `special_token_map` - Path forwarded to `HFTokenizer::from_file` as the special
///   token map file.
///
/// # Errors
///
/// Returns a `RustBertError` if `HFTokenizer::from_file` fails.
#[cfg(feature = "hf-tokenizers")]
pub fn from_hf_tokenizer_file<P: AsRef<Path>, S: AsRef<Path>>(
tokenizer_file: P,
special_token_map: S,
) -> Result<Self, RustBertError> {
let hf_tokenizer = HFTokenizer::from_file(tokenizer_file, special_token_map)?;
Ok(TokenizerOption::HFTokenizer(hf_tokenizer))
}

/// Interface method
Expand All @@ -946,7 +939,7 @@ impl TokenizerOption {
stride: usize,
) -> Vec<TokenizedInput>
where
S: AsRef<str> + Sync,
S: AsRef<str> + Send + Sync,
{
match *self {
Self::Bert(ref tokenizer) => MultiThreadedTokenizer::encode_list(
Expand Down Expand Up @@ -1082,6 +1075,8 @@ impl TokenizerOption {
truncation_strategy,
stride,
),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.encode_list(text_list).unwrap(),
}
}

Expand Down Expand Up @@ -1227,6 +1222,8 @@ impl TokenizerOption {
truncation_strategy,
stride,
),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.encode_pair_list(text_pair_list).unwrap(),
}
}

Expand Down Expand Up @@ -1297,6 +1294,8 @@ impl TokenizerOption {
Self::FNet(ref tokenizer) => {
tokenizer.encode(text_1, text_2, max_len, truncation_strategy, stride)
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.encode_pair(text_1, text_2).unwrap(),
}
}

Expand All @@ -1322,6 +1321,8 @@ impl TokenizerOption {
Self::M2M100(ref tokenizer) => tokenizer.tokenize(text),
Self::NLLB(ref tokenizer) => tokenizer.tokenize(text),
Self::FNet(ref tokenizer) => tokenizer.tokenize(text),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.tokenize(text),
}
}

Expand All @@ -1347,13 +1348,15 @@ impl TokenizerOption {
Self::M2M100(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
Self::NLLB(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
Self::FNet(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
}
}

/// Interface method to tokenization
pub fn tokenize_list<S>(&self, text: &[S]) -> Vec<Vec<String>>
where
S: AsRef<str> + Sync,
S: AsRef<str> + Send + Sync,
{
match *self {
Self::Bert(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
Expand Down Expand Up @@ -1383,6 +1386,8 @@ impl TokenizerOption {
Self::M2M100(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
Self::NLLB(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
Self::FNet(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.tokenize_list(text),
}
}

Expand Down Expand Up @@ -1451,6 +1456,8 @@ impl TokenizerOption {
Self::FNet(ref tokenizer) => {
tokenizer.decode(token_ids, skip_special_tokens, clean_up_tokenization_spaces)
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.decode(token_ids, skip_special_tokens),
}
}

Expand Down Expand Up @@ -1537,6 +1544,13 @@ impl TokenizerOption {
token_ids_with_offsets_1,
token_ids_with_offsets_2,
),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => {
return tokenizer.build_input_with_special_tokens(
token_ids_with_offsets_1,
token_ids_with_offsets_2,
)
}
};
TokenizedInput {
token_ids: token_ids_with_special_tokens.token_ids,
Expand Down Expand Up @@ -1736,6 +1750,8 @@ impl TokenizerOption {
Self::M2M100(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
Self::NLLB(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
Self::FNet(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
}
}

Expand Down Expand Up @@ -1818,6 +1834,10 @@ impl TokenizerOption {
let vocab = MultiThreadedTokenizer::vocab(tokenizer);
vocab.token_to_id(vocab.get_unknown_value())
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => {
tokenizer.token_to_id(&tokenizer.special_token_map.unk_token)
}
}
}

Expand Down Expand Up @@ -1888,6 +1908,12 @@ impl TokenizerOption {
let vocab = MultiThreadedTokenizer::vocab(tokenizer);
Some(vocab.token_to_id(vocab.get_pad_value()))
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer
.special_token_map
.pad_token
.as_ref()
.map(|token| tokenizer.token_to_id(token)),
Self::Reformer(_) => None,
Self::GPT2(_) => None,
Self::OpenAiGpt(_) => None,
Expand Down Expand Up @@ -1949,6 +1975,12 @@ impl TokenizerOption {
let vocab = MultiThreadedTokenizer::vocab(tokenizer);
Some(vocab.token_to_id(vocab.get_sep_value()))
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer
.special_token_map
.sep_token
.as_ref()
.map(|token| tokenizer.token_to_id(token)),
Self::Marian(_) => None,
Self::T5(_) => None,
Self::GPT2(_) => None,
Expand Down Expand Up @@ -2009,6 +2041,12 @@ impl TokenizerOption {
let vocab = MultiThreadedTokenizer::vocab(tokenizer);
Some(vocab.token_to_id(vocab.get_mask_value()))
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer
.special_token_map
.mask_token
.as_ref()
.map(|token| tokenizer.token_to_id(token)),
Self::Marian(_) => None,
Self::M2M100(_) => None,
Self::NLLB(_) => None,
Expand Down Expand Up @@ -2058,6 +2096,8 @@ impl TokenizerOption {
Self::Pegasus(ref tokenizer) => {
Some(MultiThreadedTokenizer::vocab(tokenizer).get_mask_value())
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer.special_token_map.mask_token.as_deref(),
Self::M2M100(_) => None,
Self::NLLB(_) => None,
Self::Marian(_) => None,
Expand Down Expand Up @@ -2111,6 +2151,12 @@ impl TokenizerOption {
let vocab = MultiThreadedTokenizer::vocab(tokenizer);
Some(vocab.token_to_id(vocab.get_bos_value()))
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer
.special_token_map
.bos_token
.as_ref()
.map(|token| tokenizer.token_to_id(token)),
Self::MBart50(_) => Some(0),
Self::FNet(_) => None,
Self::Bert(_) => None,
Expand Down Expand Up @@ -2186,6 +2232,12 @@ impl TokenizerOption {
let vocab = MultiThreadedTokenizer::vocab(tokenizer);
Some(vocab.token_to_id(vocab.get_eos_value()))
}
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref tokenizer) => tokenizer
.special_token_map
.eos_token
.as_ref()
.map(|token| tokenizer.token_to_id(token)),
Self::FNet(_) => None,
Self::Bert(_) => None,
Self::ProphetNet(_) => None,
Expand Down Expand Up @@ -2264,6 +2316,8 @@ impl TokenizerOption {
Self::M2M100(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
Self::NLLB(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
Self::FNet(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
}
}

Expand All @@ -2289,6 +2343,8 @@ impl TokenizerOption {
Self::M2M100(ref mut tokenizer) => tokenizer.add_tokens(tokens),
Self::NLLB(ref mut tokenizer) => tokenizer.add_tokens(tokens),
Self::FNet(ref mut tokenizer) => tokenizer.add_tokens(tokens),
#[cfg(feature = "hf-tokenizers")]
Self::HFTokenizer(ref mut tokenizer) => tokenizer.add_tokens(tokens),
}
}
}
Loading

0 comments on commit fd1e66b

Please sign in to comment.