update bpe tokenizer, lower-case special tokens (facebookresearch#994)
alexholdenmiller authored Jul 26, 2018
1 parent 13df86d commit fea6d26
Showing 3 changed files with 154 additions and 142 deletions.
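Of the three changed files, only the seq2seq agent's diff is rendered below; the "lower-case special tokens" half of the commit title lives in the dictionary code, which is not shown here. As a hedged illustration only, the token names in this sketch are assumed from ParlAI's conventions, not read from this page:

# Hypothetical sketch of what lower-casing special tokens amounts to;
# the actual dict-agent diff is not rendered on this page.
SPECIAL_TOKENS = ['__NULL__', '__START__', '__END__', '__UNK__']
print([tok.lower() for tok in SPECIAL_TOKENS])
# ['__null__', '__start__', '__end__', '__unk__']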
parlai/agents/seq2seq/seq2seq.py (5 changes: 4 additions & 1 deletion)
@@ -246,7 +246,9 @@ def __init__(self, opt, shared=None):
                 start_idx=self.START_IDX, end_idx=self.END_IDX,
                 longest_label=states.get('longest_label', 1))
 
-            if not states and opt['embedding_type'] != 'random':
+            if opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random':
+                print('skipping preinitialization of embeddings for bpe')
+            elif not states and opt['embedding_type'] != 'random':
                 # set up preinitialized embeddings
                 try:
                     import torchtext.vocab as vocab
@@ -547,6 +549,7 @@ def vectorize(self, observations):
             observations, self.dict, end_idx=self.END_IDX,
             null_idx=self.NULL_IDX, dq=True, eval_labels=True,
             truncate=self.truncate)
+
         if xs is None:
             return None, None, None, None, None, None, None
         xs = torch.LongTensor(xs)
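Why the new guard helps (a hedged note, not stated in the diff itself): pretrained vectors such as GloVe are keyed by whole words, while a BPE dictionary holds subword merges that mostly miss in a word-level vocabulary, so preinitializing embeddings buys little there. Below is a minimal runnable sketch of the guard as it reads after this commit; maybe_init_embeddings and init_pretrained_embeddings are hypothetical names standing in for the inline code and the torchtext-based setup in seq2seq.py:

def init_pretrained_embeddings(opt):
    # Hypothetical stand-in for the torchtext-based GloVe/word2vec setup.
    print('loading pretrained vectors:', opt['embedding_type'])

def maybe_init_embeddings(opt, states):
    # Mirrors the guard introduced by this commit.
    if opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random':
        # BPE yields subword units that rarely match word-level
        # pretrained vocabularies, so skip preinitialization.
        print('skipping preinitialization of embeddings for bpe')
    elif not states and opt['embedding_type'] != 'random':
        init_pretrained_embeddings(opt)

# Usage with assumed option values:
maybe_init_embeddings({'dict_tokenizer': 'bpe', 'embedding_type': 'glove'}, states=None)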
