
Commit

Merge pull request nomic-ai#268 from MalikMAlna/dev
Slight cleanup
AndriyMulyar authored Apr 7, 2023
2 parents 7d06b4c + 43ddc3e commit 8e28a33
Showing 1 changed file with 3 additions and 4 deletions.
data.py (7 changes: 3 additions & 4 deletions)
@@ -31,7 +31,7 @@ def tokenize_inputs(config, tokenizer, examples):
 
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
-        # add eos token, enforce stopping if we don't truncate
+        # add eos token; ensure generation stops if inputs aren't truncated
         # we don't want long code to stop generating if truncated during training
         if newline_plus_inputs + len(target_tokens) < max_length:
             input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
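
For reference, the comment change above concerns where the eos token is written into the padded input_ids buffer. A minimal standalone sketch of that pattern (toy values only; place_eos, the max_length of 10, and the eos id 50256 are made-up illustrations, not the repository's actual tokenize_inputs):

import numpy as np

def place_eos(input_ids, row, start, target_tokens, max_length, eos_token_id):
    # write the target tokens right after the prompt prefix
    end = start + len(target_tokens)
    input_ids[row, start:end] = target_tokens
    # append eos only when the example fits inside max_length;
    # truncated long examples are left open so training doesn't teach early stopping
    if end < max_length:
        input_ids[row, end] = eos_token_id
    return input_ids

# toy usage: one row, max_length of 10, GPT-2-style eos id
ids = np.zeros((1, 10), dtype=np.int64)
place_eos(ids, row=0, start=3, target_tokens=[11, 12, 13], max_length=10, eos_token_id=50256)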
@@ -57,7 +57,6 @@ def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
 
     if os.path.exists(dataset_path):
-        # check if path is a directory
         if os.path.isdir(dataset_path):
             files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
         else:
@@ -68,7 +67,7 @@ def load_data(config, tokenizer):
         dataset = load_dataset("json", data_files=files, split="train")
 
     else:
-        dataset = load_dataset(dataset_path,split='train')
+        dataset = load_dataset(dataset_path, split="train")
 
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
 
@@ -87,7 +86,7 @@ def load_data(config, tokenizer):
         **kwargs
     )
     val_dataset = val_dataset.map(
-        lambda ele: tokenize_inputs(config, tokenizer, ele),
+        lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
         remove_columns=["source", "prompt"],
         **kwargs
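
Taken together, the three load_data hunks cover the dataset loading path. A rough, self-contained sketch of that flow under assumed config values (the dataset_path and seed below are hypothetical; the datasets calls mirror the ones in the diff):

import glob
import os

from datasets import load_dataset

config = {"dataset_path": "data/", "seed": 42}  # hypothetical config

dataset_path = config["dataset_path"]
if os.path.exists(dataset_path):
    # local path: a directory of *_clean.jsonl shards or a single jsonl file
    if os.path.isdir(dataset_path):
        files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
    else:
        files = [dataset_path]
    dataset = load_dataset("json", data_files=files, split="train")
else:
    # otherwise treat dataset_path as a Hugging Face Hub dataset name
    dataset = load_dataset(dataset_path, split="train")

# 95/5 train/validation split, seeded for reproducibility
dataset = dataset.train_test_split(test_size=0.05, seed=config["seed"])
train_dataset, val_dataset = dataset["train"], dataset["test"]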

0 comments on commit 8e28a33
