
Commit

Merge pull request nomic-ai#268 from MalikMAlna/dev
Slight cleanup
AndriyMulyar authored Apr 7, 2023
2 parents 7d06b4c + 43ddc3e commit 8e28a33
Showing 1 changed file with 3 additions and 4 deletions.
data.py (7 changes: 3 additions & 4 deletions)
@@ -31,7 +31,7 @@ def tokenize_inputs(config, tokenizer, examples):
 
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
-        # add eos token, enforce stopping if we don't truncate
+        # add eos token; ensure generation stops if inputs aren't truncated
         # we don't want long code to stop generating if truncated during training
         if newline_plus_inputs + len(target_tokens) < max_length:
             input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
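
For reference, the comment change above concerns where the eos token is written into the padded input_ids buffer. A minimal standalone sketch of that pattern (toy values only; place_eos, the max_length of 10, and the eos id 50256 are made-up illustrations, not the repository's actual tokenize_inputs):

import numpy as np

def place_eos(input_ids, row, start, target_tokens, max_length, eos_token_id):
    # write the target tokens right after the prompt prefix
    end = start + len(target_tokens)
    input_ids[row, start:end] = target_tokens
    # append eos only when the example fits inside max_length;
    # truncated long examples are left open so training doesn't teach early stopping
    if end < max_length:
        input_ids[row, end] = eos_token_id
    return input_ids

# toy usage: one row, max_length of 10, GPT-2-style eos id
ids = np.zeros((1, 10), dtype=np.int64)
place_eos(ids, row=0, start=3, target_tokens=[11, 12, 13], max_length=10, eos_token_id=50256)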
@@ -57,7 +57,6 @@ def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
 
     if os.path.exists(dataset_path):
-        # check if path is a directory
         if os.path.isdir(dataset_path):
             files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
         else:
@@ -68,7 +67,7 @@ def load_data(config, tokenizer):
         dataset = load_dataset("json", data_files=files, split="train")
 
     else:
-        dataset = load_dataset(dataset_path,split='train')
+        dataset = load_dataset(dataset_path, split="train")
 
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
 
@@ -87,7 +86,7 @@ def load_data(config, tokenizer):
         **kwargs
     )
     val_dataset = val_dataset.map(
-        lambda ele: tokenize_inputs(config, tokenizer, ele),
+        lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
         remove_columns=["source", "prompt"],
         **kwargs
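
Taken together, the three load_data hunks cover the dataset loading path. A rough, self-contained sketch of that flow under assumed config values (the dataset_path and seed below are hypothetical; the datasets calls mirror the ones in the diff):

import glob
import os

from datasets import load_dataset

config = {"dataset_path": "data/", "seed": 42}  # hypothetical config

dataset_path = config["dataset_path"]
if os.path.exists(dataset_path):
    # local path: a directory of *_clean.jsonl shards or a single jsonl file
    if os.path.isdir(dataset_path):
        files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
    else:
        files = [dataset_path]
    dataset = load_dataset("json", data_files=files, split="train")
else:
    # otherwise treat dataset_path as a Hugging Face Hub dataset name
    dataset = load_dataset(dataset_path, split="train")

# 95/5 train/validation split, seeded for reproducibility
dataset = dataset.train_test_split(test_size=0.05, seed=config["seed"])
train_dataset, val_dataset = dataset["train"], dataset["test"]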

0 comments on commit 8e28a33
