Add alpaca 2k (Lightning-AI#1027)
Co-authored-by: awaelchli <[email protected]>
rasbt and awaelchli committed Mar 15, 2024
1 parent b1423fa commit 71ba993
Showing 4 changed files with 63 additions and 1 deletion.
2 changes: 2 additions & 0 deletions lit_gpt/data/__init__.py
@@ -2,6 +2,7 @@

from lit_gpt.data.base import LitDataModule, SFTDataset, get_sft_collate_fn
from lit_gpt.data.alpaca import Alpaca
from lit_gpt.data.alpaca_2k import Alpaca2k
from lit_gpt.data.alpaca_gpt4 import AlpacaGPT4
from lit_gpt.data.json import JSON
from lit_gpt.data.deita import Deita
@@ -16,6 +17,7 @@

__all__ = [
    "Alpaca",
    "Alpaca2k",
    "AlpacaGPT4",
    "Deita",
    "Dolly",
53 changes: 53 additions & 0 deletions lit_gpt/data/alpaca_2k.py
@@ -0,0 +1,53 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.


from dataclasses import dataclass, field
from pathlib import Path
from lit_gpt.data.alpaca import Alpaca
from lit_gpt.data import SFTDataset


@dataclass
class Alpaca2k(Alpaca):
    """Alpaca2k data module for supervised finetuning."""

    test_split_fraction: float = 0.05  # to get exactly 100 test samples (0.05 * 2000)
    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
    download_dir: Path = Path("./data/alpaca2k")
    """The directory in which the downloaded dataset gets saved."""
    repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test")
    """The Hugging Face dataset repository ID from which to download the data."""
    file_name: str = field(repr=False, default="alpaca2k_data_cleaned_archive.json")
    """The name of the dataset file to download."""

    def prepare_data(self) -> None:
        from datasets import load_dataset

        load_dataset(self.repo_id, cache_dir=self.download_dir)

    def setup(self, stage: str = "") -> None:
        from datasets import load_dataset

        dataset = load_dataset(self.repo_id, cache_dir=self.download_dir)

        # Hold out `test_split_fraction` of the samples for validation/testing.
        train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed)
        train_data = train_validation_split["train"]
        test_data = train_validation_split["test"]

        self.train_dataset = SFTDataset(
            data=train_data,
            tokenizer=self.tokenizer,
            prompt_style=self.prompt_style,
            max_seq_length=self.max_seq_length,
            mask_prompt=self.mask_prompt,
            ignore_index=self.ignore_index,
        )
        self.test_dataset = SFTDataset(
            data=test_data,
            tokenizer=self.tokenizer,
            prompt_style=self.prompt_style,
            max_seq_length=self.max_seq_length,
            mask_prompt=self.mask_prompt,
            ignore_index=self.ignore_index,
        )
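
For reference, here is a minimal usage sketch of the new module. The checkpoint path is hypothetical, and the `Tokenizer` import path and `connect` hook are assumptions based on the `LitDataModule` interface in `lit_gpt.data.base`:

```python
from pathlib import Path

from lit_gpt.data import Alpaca2k
from lit_gpt.tokenizer import Tokenizer  # assumed import path

data = Alpaca2k()
# Hypothetical checkpoint directory containing the tokenizer files.
tokenizer = Tokenizer(Path("checkpoints/meta-llama/Llama-2-7b-hf"))
# `connect` is assumed to be inherited from the LitDataModule base class.
data.connect(tokenizer=tokenizer, batch_size=4, max_seq_length=512)

data.prepare_data()  # downloads the dataset from the Hugging Face Hub
data.setup()

# With test_split_fraction=0.05, the 2,000 samples split into 1,900 train and 100 test.
print(len(data.train_dataset), len(data.test_dataset))
```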

Binary file added tutorials/images/prepare_dataset/alpaca-2k.jpg
9 changes: 8 additions & 1 deletion tutorials/prepare_dataset.md
@@ -5,9 +5,10 @@ Below is a table of all datasets that are currently supported in Lit-GPT:
| Name | Task | Size | Reference Repo | Paper / Blog | Data License |
|--------------|-------------|----------------------|-------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) |
| Alpaca-2k    | Finetuning  | 2,000 samples        | [URL](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test) | See Alpaca above | See Alpaca above |
| Alpaca-GPT4  | Finetuning  | 52,002 samples       | [URL](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | [URL](https://arxiv.org/abs/2304.03277) | Attribution-NonCommercial 4.0 International, [URL](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/DATA_LICENSE) |
| Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) |
| Deita        | Finetuning  | 9,500 samples        | [URL](https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft/tree/main/data) | [URL](https://arxiv.org/abs/2312.15685) | MIT, [URL](https://huggingface.co/datasets/hkust-nlp/deita-10k-v0/blob/main/README.md) |
| Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) |
| FLAN         | Finetuning  | 1,753,240 samples    | [URL](https://huggingface.co/datasets/Muennighoff/flan) | [URL](https://blog.research.google/2023/02/the-flan-collection-advancing-open.html) | Subset dependent |
| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) |
@@ -70,6 +71,12 @@ For comparison, the Falcon 7B model requires 23.52 GB of memory for the original

&nbsp;

### Alpaca-2k

[Alpaca-2k](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test) is a smaller, 2,000-sample subset of the Alpaca dataset described above.

<img src="images/prepare_dataset/alpaca-2k.jpg" width=400px>
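
To peek at the raw data, the dataset can be loaded directly from the Hugging Face Hub. This is a minimal sketch; the column names are an assumption based on the standard Alpaca format:

```python
from datasets import load_dataset

dataset = load_dataset("mhenrichsen/alpaca_2k_test")
print(dataset["train"].num_rows)  # 2000
# Assumed Alpaca-style columns: instruction, input, output.
print(dataset["train"][0]["instruction"])
```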

### Alpaca-GPT4


