Merge pull request FlagAI-Open#60 from baai-open-internal/fix_glm_tokenizer

Fix glm tokenizer
marscrazy authored Aug 29, 2022
2 parents 35b5d9a + 56c081f commit c3c3569
Showing 1 changed file with 10 additions and 13 deletions.
23 changes: 10 additions & 13 deletions flagai/data/tokenizer/uni_tokenizer/tokenizer.py
@@ -31,18 +31,15 @@
from typing import List, Union, Optional
import unicodedata


def is_control(ch):
    """Check whether a character is a control character.
    https://en.wikipedia.org/wiki/Control_character
    https://www.fileformat.info/info/unicode/category/Cc/index.htm
    https://www.fileformat.info/info/unicode/category/Cf/index.htm
    """
    return unicodedata.category(ch) in ('Cc', 'Cf')



class Tokenizer(BaseTokenizer):
def __init__(self,
add_block_symbols=True,
@@ -56,7 +53,7 @@ def __init__(self,
if self.tokenizer_class == "wp":
self.text_tokenizer = WordpieceTokenizer(self.vocab_file)
elif self.tokenizer_class == "bpe":
- if self.tokenizer_model_name.startswith('clip'):
+ if self.tokenizer_model_name.lower().startswith('clip'):
self.text_tokenizer = MMBPETokenizer(self.vocab_file, self.merges_file)
else:
self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file)
@@ -65,7 +62,7 @@ def __init__(self,
else:
raise NotImplementedError("cannot assign a tokenize class")

- self.is_glm = self.tokenizer_model_name.startswith('GLM')
+ self.is_glm = self.tokenizer_model_name.lower().startswith('glm')
# self.is_clip = self.tokenizer_model_name.startswith('clip')
self.num_tokens = self.text_tokenizer.vocab_size

@@ -125,7 +122,7 @@ def __init__(self,
self.num_tokens += 2
self.num_command_tokens += 2
elif self.tokenizer_class == "bpe":
- if self.tokenizer_model_name.startswith('roberta'):
+ if self.tokenizer_model_name.lower().startswith('roberta'):
self.num_command_tokens = 6
self.num_text_tokens = self.num_tokens - 3
self._command_tokens = [
@@ -151,7 +148,7 @@ def __init__(self,
])
self.num_tokens += 2
self.num_command_tokens += 2
- elif self.tokenizer_model_name.startswith('clip'):
+ elif self.tokenizer_model_name.lower().startswith('clip'):
self.num_command_tokens = 2
self._command_tokens = [
CommandToken('sot', '<start_of_text>',
@@ -170,7 +167,7 @@ def __init__(self,
self.text_tokenizer.convert_token_to_id('<|endoftext|>'))
]
if add_block_symbols:
- if self.tokenizer_model_name.startswith('GLM'):
+ if self.tokenizer_model_name.lower().startswith('glm'):
unk_token_id = self.num_tokens + 5
cls_token_id = self.num_tokens + 2
num_tokens_to_add = 5
@@ -215,7 +212,7 @@ def __init__(self,
self.num_text_tokens = self.text_tokenizer.vocab_size
self.num_tokens = self.num_text_tokens

- if self.tokenizer_model_name.startswith('GLM'):
+ if self.tokenizer_model_name.lower().startswith('glm'):
pad_token_id = self.num_tokens
eos_token_id = self.num_tokens
unk_token_id = self.num_tokens + 4
@@ -450,7 +447,6 @@ def CommandTokenIds(self, exception=None):
result.append(s.Id)
return (result)


def encode_plus_non_glm(
self,
text,
@@ -474,7 +470,6 @@ def get_input_ids(text):
max_length=max_length,
)


def prepare_for_model(
self,
ids: List[int],
@@ -517,7 +512,7 @@ def encode_plus( #for Seq2seq
truncation=True,
max_length=None,
):
- if not self.tokenizer_model_name.startswith("GLM"):
+ if not self.tokenizer_model_name.lower().startswith("glm"):
return self.encode_plus_non_glm(source_text, second_text, truncation, max_length)
sop_id = self.get_command_id('sop') #start of piece
eop_id = self.get_command_id('eop') #end of piece
@@ -595,7 +590,9 @@ def tokenize_as_tensor(self, texts):
eot_token = self.get_command_id('eot')
return self.text_tokenizer.tokenize(texts, sot_token=sot_token, eot_token=eot_token)


def tokenize(self, texts):
return self.text_tokenizer.tokenize(texts)
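
The substantive change in this commit is switching the tokenizer_model_name prefix checks from case-sensitive to case-insensitive matching (name.startswith('GLM') becomes name.lower().startswith('glm'), and likewise for the 'clip' and 'roberta' branches), so mixed-case model names select the intended tokenizer setup. A minimal sketch of the effect follows; it is not part of the commit, and the model names used are illustrative examples only:

    # Illustrative sketch of the behavior change; model names are assumed examples.
    def is_glm_before(name: str) -> bool:
        # Pre-fix check: case-sensitive, so lowercase names fall through.
        return name.startswith('GLM')

    def is_glm_after(name: str) -> bool:
        # Post-fix check: lowercase the name first, then match the prefix.
        return name.lower().startswith('glm')

    for name in ['GLM-large-ch', 'glm-large-en', 'glm-10b-ch']:
        print(name, is_glm_before(name), is_glm_after(name))
    # Only 'GLM-large-ch' passes the old check; all three pass the new one.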


