Skip to content

Commit

Permalink
fix vocab encoding
Browse files: browse the repository at this point in the history
  • Loading branch information
www committed Sep 4, 2023
1 parent 5d5b456 commit b8c237d
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion RWKV-v4neo/src/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __init__(self, args):
for u in unique:
xxObj[xx] = u
xx += 1
- with open(f"{args.proj_dir}/vocab.json", "w", encoding="utf-16le") as vocab_file:
+ with open(f"{args.proj_dir}/vocab.json", "w", encoding="utf-8") as vocab_file:
vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
self.data_size = len(self.data)
rank_zero_info(f"Data has {self.data_size} tokens, {self.vocab_size} vocab size.")
Expand Down

0 comments on commit b8c237d

Please sign in to comment.