Add alpaca 2k (Lightning-AI#1027)
Co-authored-by: awaelchli <[email protected]>
rasbt and awaelchli committed Mar 15, 2024
1 parent b1423fa commit 71ba993
Showing 4 changed files with 63 additions and 1 deletion.
2 changes: 2 additions & 0 deletions lit_gpt/data/__init__.py
@@ -2,6 +2,7 @@

from lit_gpt.data.base import LitDataModule, SFTDataset, get_sft_collate_fn
from lit_gpt.data.alpaca import Alpaca
from lit_gpt.data.alpaca_2k import Alpaca2k
from lit_gpt.data.alpaca_gpt4 import AlpacaGPT4
from lit_gpt.data.json import JSON
from lit_gpt.data.deita import Deita
@@ -16,6 +17,7 @@

__all__ = [
    "Alpaca",
    "Alpaca2k",
    "AlpacaGPT4",
    "Deita",
    "Dolly",
53 changes: 53 additions & 0 deletions lit_gpt/data/alpaca_2k.py
@@ -0,0 +1,53 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.


from dataclasses import dataclass, field
from pathlib import Path
from lit_gpt.data.alpaca import Alpaca
from lit_gpt.data import SFTDataset


@dataclass
class Alpaca2k(Alpaca):
    """Alpaca2k data module for supervised finetuning."""

    test_split_fraction: float = 0.05  # to get exactly 100 test samples (0.05 * 2000)
    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
    download_dir: Path = Path("./data/alpaca2k")
    """The directory in which the downloaded dataset gets saved."""
    repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test")
    """The Hugging Face dataset repository ID from which to download the data."""
    file_name: str = field(repr=False, default="alpaca2k_data_cleaned_archive.json")
    """The name of the dataset file to download."""

    def prepare_data(self) -> None:
        from datasets import load_dataset

        load_dataset(self.repo_id, cache_dir=self.download_dir)

    def setup(self, stage: str = "") -> None:
        from datasets import load_dataset

        dataset = load_dataset(self.repo_id, cache_dir=self.download_dir)

        # Hold out `test_split_fraction` of the samples for validation/testing.
        train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed)
        train_data = train_validation_split["train"]
        test_data = train_validation_split["test"]

        self.train_dataset = SFTDataset(
            data=train_data,
            tokenizer=self.tokenizer,
            prompt_style=self.prompt_style,
            max_seq_length=self.max_seq_length,
            mask_prompt=self.mask_prompt,
            ignore_index=self.ignore_index,
        )
        self.test_dataset = SFTDataset(
            data=test_data,
            tokenizer=self.tokenizer,
            prompt_style=self.prompt_style,
            max_seq_length=self.max_seq_length,
            mask_prompt=self.mask_prompt,
            ignore_index=self.ignore_index,
        )
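
For reference, here is a minimal usage sketch of the new module. The checkpoint path is hypothetical, and the `Tokenizer` import path and `connect` hook are assumptions based on the `LitDataModule` interface in `lit_gpt.data.base`:

```python
from pathlib import Path

from lit_gpt.data import Alpaca2k
from lit_gpt.tokenizer import Tokenizer  # assumed import path

data = Alpaca2k()
# Hypothetical checkpoint directory containing the tokenizer files.
tokenizer = Tokenizer(Path("checkpoints/meta-llama/Llama-2-7b-hf"))
# `connect` is assumed to be inherited from the LitDataModule base class.
data.connect(tokenizer=tokenizer, batch_size=4, max_seq_length=512)

data.prepare_data()  # downloads the dataset from the Hugging Face Hub
data.setup()

# With test_split_fraction=0.05, the 2,000 samples split into 1,900 train and 100 test.
print(len(data.train_dataset), len(data.test_dataset))
```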

Binary file added tutorials/images/prepare_dataset/alpaca-2k.jpg
9 changes: 8 additions & 1 deletion tutorials/prepare_dataset.md
@@ -5,9 +5,10 @@ Below is a table of all datasets that are currently supported in Lit-GPT:
| Name | Task | Size | Reference Repo | Paper / Blog | Data License |
|--------------|-------------|----------------------|-------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) |
| Alpaca-2k    | Finetuning  | 2,000 samples        | [URL](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test) | See Alpaca above | See Alpaca above |
| Alpaca-GPT4  | Finetuning  | 52,002 samples       | [URL](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | [URL](https://arxiv.org/abs/2304.03277) | Attribution-NonCommercial 4.0 International, [URL](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/DATA_LICENSE) |
| Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) |
| Deita        | Finetuning  | 9,500 samples        | [URL](https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft/tree/main/data) | [URL](https://arxiv.org/abs/2312.15685) | MIT, [URL](https://huggingface.co/datasets/hkust-nlp/deita-10k-v0/blob/main/README.md) |
| Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) |
| FLAN         | Finetuning  | 1,753,240 samples    | [URL](https://huggingface.co/datasets/Muennighoff/flan) | [URL](https://blog.research.google/2023/02/the-flan-collection-advancing-open.html) | Subset dependent |
| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) |
@@ -70,6 +71,12 @@ For comparison, the Falcon 7B model requires 23.52 GB of memory for the original

&nbsp;

### Alpaca-2k

[Alpaca-2k](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test) is a smaller, 2,000-sample subset of the Alpaca dataset described above.

<img src="images/prepare_dataset/alpaca-2k.jpg" width=400px>
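
To peek at the raw data, the dataset can be loaded directly from the Hugging Face Hub. This is a minimal sketch; the column names are an assumption based on the standard Alpaca format:

```python
from datasets import load_dataset

dataset = load_dataset("mhenrichsen/alpaca_2k_test")
print(dataset["train"].num_rows)  # 2000
# Assumed Alpaca-style columns: instruction, input, output.
print(dataset["train"][0]["instruction"])
```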

### Alpaca-GPT4


