Skip to content

Commit

Permalink
Better sub-batching when using BERT
Browse files: browse the repository at this point in the history
  • Loading branch information
nikitakit committed Jun 28, 2019
1 parent 9864130 commit 9ec778f
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion src/parse_nk.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -813,7 +813,13 @@ def from_spec(cls, spec, model):
return res

def split_batch(self, sentences, golds, subbatch_max_tokens=3000):
lens = [len(sentence) + 2 for sentence in sentences]
if self.bert is not None:
lens = [
len(self.bert_tokenizer.tokenize(' '.join([word for (_, word) in sentence]))) + 2
for sentence in sentences
]
else:
lens = [len(sentence) + 2 for sentence in sentences]

lens = np.asarray(lens, dtype=int)
lens_argsort = np.argsort(lens).tolist()
Expand Down

0 comments on commit 9ec778f

Please sign in to comment.