Skip to content

Commit

Permalink
Remove input_ids_no_response from instruction finetuning scripts (L…
Browse files Browse the repository at this point in the history
  • Loading branch information
awaelchli authored Dec 18, 2023
1 parent f40b32e commit 742f9a7
Show file tree
Hide file tree
Showing 6 changed files with 1 addition and 12 deletions.
1 change: 0 additions & 1 deletion scripts/prepare_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in
 return {
 **example,
 "input_ids": encoded_full_prompt_and_response,
-"input_ids_no_response": encoded_full_prompt,
 "labels": labels,
 }

Expand Down
1 change: 0 additions & 1 deletion scripts/prepare_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in
 return {
 **example,
 "input_ids": encoded_full_prompt_and_response,
-"input_ids_no_response": encoded_full_prompt,
 "labels": labels,
 }

Expand Down
3 changes: 1 addition & 2 deletions scripts/prepare_dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def download_if_missing(file_path: Path, file_url: str) -> None:
f.write(requests.get(file_url).text)


-def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> None:
+def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
"""Processes a single sample.
Each sample in the dataset consists of:
Expand Down Expand Up @@ -127,7 +127,6 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in
 return {
 **example,
 "input_ids": encoded_full_prompt_and_response,
-"input_ids_no_response": encoded_full_prompt,
 "labels": labels,
 }

Expand Down
1 change: 0 additions & 1 deletion scripts/prepare_lima.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in
 return {
 **example,
 "input_ids": encoded_full_prompt_and_response,
-"input_ids_no_response": encoded_full_prompt,
 "labels": labels,
 }

Expand Down
1 change: 0 additions & 1 deletion scripts/prepare_longform.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in
 return {
 **example,
 "input_ids": encoded_full_prompt_and_response,
-"input_ids_no_response": encoded_full_prompt,
 "labels": labels,
 }

Expand Down
6 changes: 0 additions & 6 deletions tests/test_prepare_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,20 @@ def test_prepare_csv(tmp_path, fake_checkpoint_dir):
 "input": "2+2",
 "output": "4",
 "input_ids": ANY,
-"input_ids_no_response": ANY,
 "labels": ANY,
 },
 {
 "instruction": "Divide",
 "input": "10/2",
 "output": "5",
 "input_ids": ANY,
-"input_ids_no_response": ANY,
 "labels": ANY,
 },
 {
 "instruction": "Multiply",
 "input": "6*4",
 "output": "24",
 "input_ids": ANY,
-"input_ids_no_response": ANY,
 "labels": ANY,
 },
 ],
Expand All @@ -68,23 +65,20 @@ def test_prepare_csv(tmp_path, fake_checkpoint_dir):
 "input": "2^3",
 "output": "8",
 "input_ids": ANY,
-"input_ids_no_response": ANY,
 "labels": ANY,
 },
 {
 "instruction": "Subtract",
 "input": "5-3",
 "output": "2",
 "input_ids": ANY,
-"input_ids_no_response": ANY,
 "labels": ANY,
 },
 {
 "instruction": "Square root",
 "input": "√9",
 "output": "3",
 "input_ids": ANY,
-"input_ids_no_response": ANY,
 "labels": ANY,
 },
 ],
Expand Down

0 comments on commit 742f9a7

Please sign in to comment.