Support for HF Tokenizers (guillaume-be#408)

* tokenizers output type conversion
* WIP hf Tokenizers support (2)
* finalize interface methods for hf tokenizers
* Addition of GPT2 example with hf tokenizers
* Made hf-tokenizers optional, added doc for HFTokenizer
* Addition of tests for hf tokenizers, addition to CI
* Updated changelog, extended documentation
* Fix Clippy warnings
ShabbirHasan1 · Aug 13, 2023 · fd1e66b · fd1e66b
1 parent af3839e
commit fd1e66b
Show file tree

Hide file tree

Showing 16 changed files with 1,096 additions and 64 deletions.
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
@@ -140,8 +140,8 @@ jobs:
             --test nllb
             --features download-libtorch
 
-  test-onnx:
-    name: Integration tests (ONNX models)
+  test-opt-features:
+    name: Integration tests (Optional features)
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
@@ -155,7 +155,9 @@ jobs:
           command: test
           args: --package rust-bert
             --features onnx
+            --features hf-tokenizers
             --test onnx
+            --test hf_tokenizers
             --features download-libtorch
 
   convert-model:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,8 @@ All notable changes to this project will be documented in this file. The format
 
 ## [Unreleased]
 ## Added
-- Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel` allowing passing custom tokenizers for sentence embeddings pipelines
+- Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel` allowing passing custom tokenizers for sentence embeddings pipelines.
+- Support for Hugging Face [Tokenizers](https://github.com/huggingface/tokenizers) in pipelines, allowing tokenizers to be loaded from `tokenizer.json` and `special_token_map.json` files.
 
 ## Fixed
 - (BREAKING) Fixed the keyword extraction pipeline for n-gram sizes > 2. Add new configuration option `tokenizer_forbidden_ngram_chars` to specify characters that should be excluded from n-grams (allows filtering n-grams spanning multiple sentences).

diff --git a/Cargo.toml b/Cargo.toml
@@ -69,6 +69,7 @@ download-libtorch = ["tch/download-libtorch"]
 onnx = ["ort", "ndarray"]
 rustls-tls = ["cached-path/rustls-tls"]
 default-tls = ["cached-path/default-tls"]
+hf-tokenizers = ["tokenizers"]
 
 [package.metadata.docs.rs]
 features = ["doc-only"]
@@ -89,6 +90,7 @@ dirs = { version = "4", optional = true }
 lazy_static = { version = "1", optional = true }
 ort = {version="~1.14.8", optional = true, default-features = false, features = ["half"]}
 ndarray = {version="0.15", optional = true}
+tokenizers = {version="0.13.3", optional=true, default-features = false, features = ["onig"]}
 
 [dev-dependencies]
 anyhow = "1"

diff --git a/examples/generation_gpt2_hf_tokenizers.rs b/examples/generation_gpt2_hf_tokenizers.rs
@@ -0,0 +1,61 @@
+// Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+// Copyright 2019 Guillaume Becquin
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+extern crate anyhow;
+
+use rust_bert::pipelines::common::{ModelType, TokenizerOption};
+use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
+use rust_bert::resources::{RemoteResource, ResourceProvider};
+use std::fs::File;
+use std::io::Write;
+use tempfile::TempDir;
+
+fn main() -> anyhow::Result<()> {
+    // Set up the text generation configuration (GPT2, max_length 30, no sampling, single beam)
+    let generate_config = TextGenerationConfig {
+        model_type: ModelType::GPT2,
+        max_length: Some(30),
+        do_sample: false,
+        num_beams: 1,
+        temperature: 1.0,
+        num_return_sequences: 1,
+        ..Default::default()
+    };
+
+    // Create the tokenizer: write a special token map to a temporary file, then fetch GPT2's `tokenizer.json`
+    let tmp_dir = TempDir::new()?;
+    let special_token_map_path = tmp_dir.path().join("special_token_map.json");
+    let mut tmp_file = File::create(&special_token_map_path)?;
+    writeln!(
+        tmp_file,
+        r#"{{"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}}"#
+    )?;
+
+    let tokenizer_path = RemoteResource::from_pretrained((
+        "gpt2/tokenizer",
+        "https://huggingface.co/gpt2/resolve/main/tokenizer.json",
+    ))
+    .get_local_path()?;
+    let tokenizer =
+        TokenizerOption::from_hf_tokenizer_file(tokenizer_path, special_token_map_path)?;
+
+    let model = TextGenerationModel::new_with_tokenizer(generate_config, tokenizer)?;
+
+    let input_context = "The dog";
+    // let second_input_context = "The cat was";
+    let output = model.generate(&[input_context], None);
+
+    for sentence in output {
+        println!("{sentence:?}");
+    }
+    Ok(())
+}
diff --git a/src/models/longt5/attention.rs b/src/models/longt5/attention.rs
@@ -114,7 +114,9 @@ fn make_global_fixed_block_ids(
     attention_mask: &Tensor,
     global_block_size: i64,
 ) -> (Tensor, Tensor) {
-    let &[batch_size, seq_length, ..] = attention_mask.size().as_slice() else {unreachable!()};
+    let &[batch_size, seq_length, ..] = attention_mask.size().as_slice() else {
+        unreachable!()
+    };
 
     let handle_orphan_tokens = |block_ids: Tensor| -> Tensor {
         let block_ends = Tensor::arange(seq_length, (Kind::Int64, block_ids.device()))

diff --git a/src/pipelines/common.rs b/src/pipelines/common.rs
@@ -56,13 +56,18 @@ use rust_tokenizers::{TokenIdsWithOffsets, TokenizedInput, TokensWithOffsets};
 use serde::{Deserialize, Serialize};
 use std::collections::{HashMap, HashSet};
 use std::convert::TryFrom;
+
 use std::fmt::Debug;
+
 use std::path::{Path, PathBuf};
 use tch::{Device, Kind, Tensor};
 
 #[cfg(feature = "onnx")]
 use crate::pipelines::onnx::ONNXModelConfig;
 
+#[cfg(feature = "hf-tokenizers")]
+use crate::pipelines::hf_tokenizers::HFTokenizer;
+
 #[derive(Debug, Default)]
 /// Container for ONNX model resources, containing 3 optional resources (Encoder, Decoder and Decoder with past)
 pub struct ONNXModelResources {
@@ -288,6 +293,9 @@ pub enum TokenizerOption {
     FNet(FNetTokenizer),
     /// Bart Tokenizer
     Bart(RobertaTokenizer),
+    /// HF Tokenizer
+    #[cfg(feature = "hf-tokenizers")]
+    HFTokenizer(HFTokenizer),
 }
 
 impl ConfigOption {
@@ -913,28 +921,13 @@ impl TokenizerOption {
         Ok(tokenizer)
     }
 
-    /// Returns the model type
-    pub fn model_type(&self) -> ModelType {
-        match *self {
-            Self::Bert(_) => ModelType::Bert,
-            Self::Deberta(_) => ModelType::Deberta,
-            Self::DebertaV2(_) => ModelType::DebertaV2,
-            Self::Roberta(_) => ModelType::Roberta,
-            Self::Bart(_) => ModelType::Bart,
-            Self::XLMRoberta(_) => ModelType::XLMRoberta,
-            Self::Marian(_) => ModelType::Marian,
-            Self::T5(_) => ModelType::T5,
-            Self::Albert(_) => ModelType::Albert,
-            Self::XLNet(_) => ModelType::XLNet,
-            Self::GPT2(_) => ModelType::GPT2,
-            Self::OpenAiGpt(_) => ModelType::OpenAiGpt,
-            Self::Reformer(_) => ModelType::Reformer,
-            Self::ProphetNet(_) => ModelType::ProphetNet,
-            Self::Pegasus(_) => ModelType::Pegasus,
-            Self::MBart50(_) => ModelType::MBart,
-            Self::M2M100(_) | Self::NLLB(_) => ModelType::M2M100,
-            Self::FNet(_) => ModelType::FNet,
-        }
+    /// Creates a `TokenizerOption::HFTokenizer` from Hugging Face Tokenizers files.
+    ///
+    /// Only available when the `hf-tokenizers` feature is enabled.
+    ///
+    /// # Arguments
+    ///
+    /// * `tokenizer_file` - Path to the tokenizer definition (e.g. a `tokenizer.json` file)
+    /// * `special_token_map` - Path to the special token map (e.g. a `special_token_map.json` file)
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by `HFTokenizer::from_file` if the tokenizer cannot be built
+    /// from the provided files.
+    #[cfg(feature = "hf-tokenizers")]
+    pub fn from_hf_tokenizer_file<P: AsRef<Path>, S: AsRef<Path>>(
+        tokenizer_file: P,
+        special_token_map: S,
+    ) -> Result<Self, RustBertError> {
+        let hf_tokenizer = HFTokenizer::from_file(tokenizer_file, special_token_map)?;
+        Ok(TokenizerOption::HFTokenizer(hf_tokenizer))
     }
 
     /// Interface method
@@ -946,7 +939,7 @@ impl TokenizerOption {
         stride: usize,
     ) -> Vec<TokenizedInput>
     where
-        S: AsRef<str> + Sync,
+        S: AsRef<str> + Send + Sync,
     {
         match *self {
             Self::Bert(ref tokenizer) => MultiThreadedTokenizer::encode_list(
@@ -1082,6 +1075,8 @@ impl TokenizerOption {
                 truncation_strategy,
                 stride,
             ),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.encode_list(text_list).unwrap(),
         }
     }
 
@@ -1227,6 +1222,8 @@ impl TokenizerOption {
                 truncation_strategy,
                 stride,
             ),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.encode_pair_list(text_pair_list).unwrap(),
         }
     }
 
@@ -1297,6 +1294,8 @@ impl TokenizerOption {
             Self::FNet(ref tokenizer) => {
                 tokenizer.encode(text_1, text_2, max_len, truncation_strategy, stride)
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.encode_pair(text_1, text_2).unwrap(),
         }
     }
 
@@ -1322,6 +1321,8 @@ impl TokenizerOption {
             Self::M2M100(ref tokenizer) => tokenizer.tokenize(text),
             Self::NLLB(ref tokenizer) => tokenizer.tokenize(text),
             Self::FNet(ref tokenizer) => tokenizer.tokenize(text),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.tokenize(text),
         }
     }
 
@@ -1347,13 +1348,15 @@ impl TokenizerOption {
             Self::M2M100(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
             Self::NLLB(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
             Self::FNet(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.tokenize_with_offsets(text),
         }
     }
 
     /// Interface method to tokenization
     pub fn tokenize_list<S>(&self, text: &[S]) -> Vec<Vec<String>>
     where
-        S: AsRef<str> + Sync,
+        S: AsRef<str> + Send + Sync,
     {
         match *self {
             Self::Bert(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
@@ -1383,6 +1386,8 @@ impl TokenizerOption {
             Self::M2M100(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
             Self::NLLB(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
             Self::FNet(ref tokenizer) => MultiThreadedTokenizer::tokenize_list(tokenizer, text),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.tokenize_list(text),
         }
     }
 
@@ -1451,6 +1456,8 @@ impl TokenizerOption {
             Self::FNet(ref tokenizer) => {
                 tokenizer.decode(token_ids, skip_special_tokens, clean_up_tokenization_spaces)
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.decode(token_ids, skip_special_tokens),
         }
     }
 
@@ -1537,6 +1544,13 @@ impl TokenizerOption {
                 token_ids_with_offsets_1,
                 token_ids_with_offsets_2,
             ),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => {
+                return tokenizer.build_input_with_special_tokens(
+                    token_ids_with_offsets_1,
+                    token_ids_with_offsets_2,
+                )
+            }
         };
         TokenizedInput {
             token_ids: token_ids_with_special_tokens.token_ids,
@@ -1736,6 +1750,8 @@ impl TokenizerOption {
             Self::M2M100(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
             Self::NLLB(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
             Self::FNet(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.convert_tokens_to_ids(tokens),
         }
     }
 
@@ -1818,6 +1834,10 @@ impl TokenizerOption {
                 let vocab = MultiThreadedTokenizer::vocab(tokenizer);
                 vocab.token_to_id(vocab.get_unknown_value())
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => {
+                tokenizer.token_to_id(&tokenizer.special_token_map.unk_token)
+            }
         }
     }
 
@@ -1888,6 +1908,12 @@ impl TokenizerOption {
                 let vocab = MultiThreadedTokenizer::vocab(tokenizer);
                 Some(vocab.token_to_id(vocab.get_pad_value()))
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer
+                .special_token_map
+                .pad_token
+                .as_ref()
+                .map(|token| tokenizer.token_to_id(token)),
             Self::Reformer(_) => None,
             Self::GPT2(_) => None,
             Self::OpenAiGpt(_) => None,
@@ -1949,6 +1975,12 @@ impl TokenizerOption {
                 let vocab = MultiThreadedTokenizer::vocab(tokenizer);
                 Some(vocab.token_to_id(vocab.get_sep_value()))
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer
+                .special_token_map
+                .sep_token
+                .as_ref()
+                .map(|token| tokenizer.token_to_id(token)),
             Self::Marian(_) => None,
             Self::T5(_) => None,
             Self::GPT2(_) => None,
@@ -2009,6 +2041,12 @@ impl TokenizerOption {
                 let vocab = MultiThreadedTokenizer::vocab(tokenizer);
                 Some(vocab.token_to_id(vocab.get_mask_value()))
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer
+                .special_token_map
+                .mask_token
+                .as_ref()
+                .map(|token| tokenizer.token_to_id(token)),
             Self::Marian(_) => None,
             Self::M2M100(_) => None,
             Self::NLLB(_) => None,
@@ -2058,6 +2096,8 @@ impl TokenizerOption {
             Self::Pegasus(ref tokenizer) => {
                 Some(MultiThreadedTokenizer::vocab(tokenizer).get_mask_value())
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer.special_token_map.mask_token.as_deref(),
             Self::M2M100(_) => None,
             Self::NLLB(_) => None,
             Self::Marian(_) => None,
@@ -2111,6 +2151,12 @@ impl TokenizerOption {
                 let vocab = MultiThreadedTokenizer::vocab(tokenizer);
                 Some(vocab.token_to_id(vocab.get_bos_value()))
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer
+                .special_token_map
+                .bos_token
+                .as_ref()
+                .map(|token| tokenizer.token_to_id(token)),
             Self::MBart50(_) => Some(0),
             Self::FNet(_) => None,
             Self::Bert(_) => None,
@@ -2186,6 +2232,12 @@ impl TokenizerOption {
                 let vocab = MultiThreadedTokenizer::vocab(tokenizer);
                 Some(vocab.token_to_id(vocab.get_eos_value()))
             }
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref tokenizer) => tokenizer
+                .special_token_map
+                .eos_token
+                .as_ref()
+                .map(|token| tokenizer.token_to_id(token)),
             Self::FNet(_) => None,
             Self::Bert(_) => None,
             Self::ProphetNet(_) => None,
@@ -2264,6 +2316,8 @@ impl TokenizerOption {
             Self::M2M100(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
             Self::NLLB(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
             Self::FNet(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref mut tokenizer) => tokenizer.add_extra_ids(num_extra_ids),
         }
     }
 
@@ -2289,6 +2343,8 @@ impl TokenizerOption {
             Self::M2M100(ref mut tokenizer) => tokenizer.add_tokens(tokens),
             Self::NLLB(ref mut tokenizer) => tokenizer.add_tokens(tokens),
             Self::FNet(ref mut tokenizer) => tokenizer.add_tokens(tokens),
+            #[cfg(feature = "hf-tokenizers")]
+            Self::HFTokenizer(ref mut tokenizer) => tokenizer.add_tokens(tokens),
         }
     }
 }