Skip to content

Commit

Permalink
Add trim_offsets tokenizer param
Browse files Browse the repository at this point in the history
  • Loading branch information
minimaxir committed Dec 25, 2020
1 parent 8bf2539 commit 6fc77f9
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion aitextgen/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def train_tokenizer(
eos_token: str = "<|endoftext|>",
unk_token: str = "<|endoftext|>",
serialize: bool = True,
+ trim_offsets: bool = True,
) -> None:
"""
Tokenizes the text(s) as a tokenizer, wrapping the tokenizer package.
Expand Down Expand Up @@ -42,7 +43,7 @@ def train_tokenizer(
if isinstance(files, str):
files = [files]

- tokenizer = ByteLevelBPETokenizer(dropout=dropout)
+ tokenizer = ByteLevelBPETokenizer(dropout=dropout, trim_offsets=trim_offsets)

tokenizer.train(
files=files,
Expand Down

0 comments on commit 6fc77f9

Please sign in to comment.