Add `trim_offsets` parameter to `train_tokenizer` (forwarded to `ByteLevelBPETokenizer`)

dbanys · Dec 25, 2020 · 6fc77f9
1 parent 8bf2539
commit 6fc77f9
Showing 1 changed file with 2 additions and 1 deletion.
diff --git a/aitextgen/tokenizers.py b/aitextgen/tokenizers.py
@@ -14,6 +14,7 @@ def train_tokenizer(
     eos_token: str = "<|endoftext|>",
     unk_token: str = "<|endoftext|>",
     serialize: bool = True,
+    trim_offsets: bool = True,
 ) -> None:
     """
     Tokenizes the text(s) as a tokenizer, wrapping the tokenizer package.
@@ -42,7 +43,7 @@ def train_tokenizer(
     if isinstance(files, str):
         files = [files]
 
-    tokenizer = ByteLevelBPETokenizer(dropout=dropout)
+    tokenizer = ByteLevelBPETokenizer(dropout=dropout, trim_offsets=trim_offsets)
 
     tokenizer.train(
         files=files,