Skip to content

Commit

Permalink
whitespace-only splitting everywhere.
Browse files Browse the repository at this point in the history
  • Loading branch information
rsennrich committed Apr 25, 2018
1 parent 9a95f9f commit d40de91
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 3 deletions.
10 changes: 8 additions & 2 deletions apply_bpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None
self.version = (0, 1)
codes.seek(0)

self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]
self.bpe_codes = [tuple(item.strip().split(' ')) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

for item in self.bpe_codes:
if len(item) != 2:
sys.stderr.write('Error: invalid line in BPE codes file: {0}\n'.format(' '.join(item)))
sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n'.format(' '.join(item)))
sys.exit(1)

# some hacking to deal with duplicates (only consider first instance)
self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])
Expand Down Expand Up @@ -258,7 +264,7 @@ def read_vocabulary(vocab_file, threshold):
vocabulary = set()

for line in vocab_file:
word, freq = line.split()
word, freq = line.strip().split(' ')
freq = int(freq)
if threshold == None or freq >= threshold:
vocabulary.add(word)
Expand Down
2 changes: 1 addition & 1 deletion learn_bpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def replace_pair(pair, vocab, indices):
word, freq = vocab[j]
new_word = ' '.join(word)
new_word = pattern.sub(pair_str, new_word)
new_word = tuple(new_word.split())
new_word = tuple(new_word.split(' '))

vocab[j] = (new_word, freq)
changes.append((j, new_word, word, freq))
Expand Down

0 comments on commit d40de91

Please sign in to comment.