Merge pull request nomic-ai#335 from nomic-ai/gptj
GPT-J
bmschmidt authored Apr 13, 2023
2 parents ed53fe1 + a0fe480 commit 51264f5
Showing 33 changed files with 809 additions and 203 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
*.pkl
ckpts*
.deepspeed_env
*.jsonl
*tar.gz
ckpts**
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,6 +1,3 @@
[submodule "transformers"]
path = transformers
url = https://github.com/huggingface/transformers.git
[submodule "peft"]
path = peft
url = https://github.com/huggingface/peft.git
17 changes: 17 additions & 0 deletions GPT-J_MAP.md
@@ -0,0 +1,17 @@
# Inference on Training Data


## Run Inference

```bash
torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml
```


## Visualizations

```bash
python build_map.py
```

will build two maps in Atlas: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model.
87 changes: 78 additions & 9 deletions README.md
@@ -1,8 +1,11 @@
<h1 align="center">GPT4All</h1>
<p align="center">Demo, data, and code to train an assistant-style large language model with ~800k GPT-3.5-Turbo Generations based on LLaMa</p>
<p align="center">Demo, data, and code to train open-source assistant-style large language model based on GPT-J and LLaMa</p>
<p align="center">
<a href="https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All-J_Technical_Report_2.pdf">:green_book: Technical Report 2: GPT4All-J </a>
</p>

<p align="center">
<a href="https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All_Technical_Report.pdf">:green_book: Technical Report</a>
<a href="https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All_Technical_Report.pdf">:green_book: Technical Report 1: GPT4All</a>
</p>

<p align="center">
Expand All @@ -13,6 +16,23 @@
<a href="https://github.com/nomic-ai/gpt4all-ts">:computer: Official Typescript Bindings</a>
</p>

<p align="center">
<a href="https://github.com/nomic-ai/gpt4all-ui">:speech_balloon: Official Web Chat Interface</a>
</p>

<p align="center">
<a href="https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html">🦜️🔗 Official Langchain Backend</a>
</p>


<p align="center">
<a href="https://discord.gg/mGZE39AS3e">Discord</a>
</p>

<p align="center">
<a href="https://github.com/nomic-ai/gpt4all-ts">:computer: Official Typescript Bindings</a>
</p>

<p align="center">
<a href="https://github.com/nomic-ai/gpt4all-ui">:speech_balloon: Official Chat Interface</a>
</p>
@@ -27,6 +47,56 @@
</p>


<p align="center">
GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
</p>



## GPT4All-J: An Apache-2 Licensed GPT4All Model
![gpt4all-j-demo](https://user-images.githubusercontent.com/13879686/231876409-e3de1934-93bb-4b4b-9013-b491a969ebbc.gif)

Running on an M1 Mac (not sped up!)


### GPT4All-J Chat UI Installers
These installers set up a native chat client with auto-update functionality that runs on your desktop with the GPT4All-J model baked in.

[Mac/OSX](https://gpt4all.io/installers/gpt4all-0.1.0-Darwin.dmg)

[Windows](https://gpt4all.io/installers/gpt4all-0.1.0-win64.exe)

[Ubuntu](https://gpt4all.io/installers/gpt4all-0.1.0-Linux.run)

These installers are not yet code signed by Microsoft/Apple, so you will see security warnings on initial installation. We did not want to delay the release while waiting for the signing process to complete.

Find the most up-to-date information on the [GPT4All Website](https://gpt4all.io/)

### Raw Model
[ggml Model Download Link](https://gpt4all.io/ggml-gpt4all-j.bin)

Note that this model is only compatible with the C++ bindings found [here](https://github.com/nomic-ai/gpt4all-chat). It will not work with any existing llama.cpp bindings, as we had to make a substantial fork of llama.cpp. GPT4All will support the ecosystem around this new C++ backend going forward.

Python bindings are imminent and will be integrated into this [repository](https://github.com/nomic-ai/pyllamacpp). Stay tuned on the [GPT4All Discord](https://discord.gg/mGZE39AS3e) for updates.

## Training GPT4All-J

Please see the [GPT4All-J Technical Report](https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All-J_Technical_Report_2.pdf) for details.

### GPT4All-J Training Data

- We are releasing the curated training data for anyone to replicate GPT4All-J here: [GPT4All-J Training Data](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) (a loading sketch follows this list)
- [Atlas Map of Prompts](https://atlas.nomic.ai/map/gpt4all-j-prompts-curated)
- [Atlas Map of Responses](https://atlas.nomic.ai/map/gpt4all-j-response-curated)
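
As a quick sanity check, the curated data can be pulled directly from the Hugging Face Hub. The minimal loading sketch below uses the dataset name above; the `prompt`/`response` field names are assumptions inferred from the schema used elsewhere in this repo (e.g. `clean.py` and `build_map.py`), not guaranteed by the dataset card.

```python
from datasets import load_dataset

# Curated GPT4All-J training data released on the Hugging Face Hub.
ds = load_dataset("nomic-ai/gpt4all-j-prompt-generations", split="train")

print(ds)                      # number of rows and column names
print(ds[0]["prompt"][:200])   # "prompt"/"response" fields assumed from clean.py / build_map.py
```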

### GPT4All-J Training Instructions

```bash
accelerate launch --dynamo_backend=inductor --num_processes=8 --num_machines=1 --machine_rank=0 --deepspeed_multinode_launcher standard --mixed_precision=bf16 --use_deepspeed --deepspeed_config_file=configs/deepspeed/ds_config_gptj.json train.py --config configs/train/finetune_gptj.yaml
```


# Original GPT4All Model (based on GPL Licensed LLaMa)



@@ -113,16 +183,16 @@ Feel free to convert this to a more structured table.

# Roadmap
## Short Term
- <span style="color:green">(IN PROGRESS)</span> Train a GPT4All model based on GPTJ to alleviate llama distribution issues.
- <span style="color:green">(IN PROGRESS)</span> Create improved CPU and GPU interfaces for this model.
- <span style="color:green">(Done)</span> Train a GPT4All model based on GPTJ to alleviate llama distribution issues.
- <span style="color:green">(Done)</span> Create improved CPU and GPU interfaces for this model.
- <span style="color:green">(Done)</span> [Integrate llama.cpp bindings](https://github.com/nomic-ai/pyllamacpp)
- <span style="color:green">(Done)</span> [Create a good conversational chat interface for the model.](https://github.com/nomic-ai/gpt4all-ui)
- <span style="color:green">(Done)</span> [Allow users to opt in and submit their chats for subsequent training runs](https://github.com/nomic-ai/gpt4all-ui)

## Medium Term
- <span style="color:red">(NOT STARTED)</span> Integrate GPT4All with [Atlas](https://atlas.nomic.ai) to allow for document retrieval.
- BLOCKED by GPT4All based on GPTJ
- <span style="color:red">(NOT STARTED)</span> Integrate GPT4All with Langchain.
- <span style="color:red">(Done)</span> Integrate GPT4All with Langchain.
- <span style="color:green">(IN PROGRESS)</span> Build easy custom training scripts to allow users to fine tune models.

## Long Term
@@ -131,9 +201,11 @@

# Reproducibility

Trained LoRa Weights:
Trained Model Weights:
- gpt4all-lora (four full epochs of training): https://huggingface.co/nomic-ai/gpt4all-lora
- gpt4all-lora-epoch-2 (three full epochs of training): https://huggingface.co/nomic-ai/gpt4all-lora-epoch-2
- gpt4all-j (one full epoch of training): https://huggingface.co/nomic-ai/gpt4all-j
- gpt4all-j-lora (one full epoch of training): https://huggingface.co/nomic-ai/gpt4all-j-lora

Raw Data:
- [Training Data Without P3](https://huggingface.co/datasets/nomic-ai/gpt4all_prompt_generations)
@@ -159,9 +231,6 @@ Setup the environment
```
python -m pip install -r requirements.txt
cd transformers
pip install -e .
cd ../peft
pip install -e .
```
48 changes: 47 additions & 1 deletion TRAINING_LOG.md
@@ -23,7 +23,7 @@ We used the initial parameters:
| Weight decay | 0 |
| Warmup Steps | 100 |

We randomly shuffle and set aside %5 of the data for validation.
We randomly shuffle and set aside 5% of the data for validation.

We had an initial bug in logging the training loss, but we noticed a decrease in validation loss.

@@ -235,3 +235,49 @@ Taking inspiration from [the Alpaca Repo](https://github.com/tatsu-lab/stanford_
Comparing our LoRA model to the [Alpaca LoRA](https://huggingface.co/tloen/alpaca-lora-7b), our model has lower perplexity. Training for 3 epochs performed best, both on perplexity and on qualitative examples.

We tried training a full model using the parameters above, but found that during the second epoch the model diverged and samples generated post-training were worse than those from the first epoch.


## GPT-J Training

### Model Training Divergence

We trained multiple [GPT-J models](https://huggingface.co/EleutherAI/gpt-j-6b) with varying success. We found that training the full model diverged after epoch 1. ![](figs/overfit-gpt-j.png)


We release the checkpoint after epoch 1.


Using Atlas, we extracted the embeddings of each point in the dataset and calculated the loss per sequence. We then uploaded [this to Atlas](https://atlas.nomic.ai/map/gpt4all-j-post-epoch-1-embeddings) and noticed that the higher-loss items seemed to cluster. On further inspection, the highest-density clusters were prompt/response pairs that asked for creative generations such as `Generate a story about ...` ![](figs/clustering_overfit.png)



### GPT4All-J Hyperparameters

We varied the learning rate, learning rate schedule, and weight decay following suggestions from the [original GPT-J codebase](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md) but found no real performance difference, qualitatively or quantitatively.



The final model was trained using the following hyperparameters with a linear warmup followed by a constant learning rate (a minimal scheduler sketch follows the table):

| Hyperparameter | Value |
|----------------|-------|
| Per Device BS | 32 |
| Global BS | 256 |
| Learning rate | 2e-5 |
| Epochs | 2 |
| Max length | 1024 |
| Weight decay | 0 |
| Warmup Steps | 500 |
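
In the actual run this schedule comes from DeepSpeed's `WarmupLR` scheduler configured in `configs/deepspeed/ds_config_gptj.json` (shown later in this diff). The snippet below is only a minimal plain-PyTorch sketch of the same warmup-then-constant shape under the table's hyperparameters, assuming the EleutherAI/gpt-j-6b base checkpoint.

```python
import torch
from transformers import AutoModelForCausalLM, get_constant_schedule_with_warmup

# Base checkpoint assumed from the training log above.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b")

# Table values: lr 2e-5, weight decay 0; per-device batch 32 x 8 GPUs = global batch 256.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Linear warmup for 500 steps, then hold the learning rate constant
# (the same shape as DeepSpeed's WarmupLR used in the real run).
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=500)
```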


The LoRA model was trained using the following hyperparameters with a linear warmup followed by a constant learning rate (a LoRA configuration sketch follows the table):

| Hyperparameter | Value |
|----------------|-------|
| Per Device BS | 4 |
| Global BS | 32 |
| Learning rate | 2e-5 |
| Epochs | 2 |
| Max length | 1024 |
| Weight decay | 0 |
| Warmup Steps | 500 |
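
For reference, a LoRA setup of this kind is usually expressed with the Hugging Face `peft` library. The sketch below is illustrative only: the rank, alpha, dropout, and target modules are assumptions, since the log above records only batch size, learning rate, epochs, max length, weight decay, and warmup steps.

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b")

# NOTE: r, lora_alpha, lora_dropout, and target_modules are illustrative assumptions,
# not values taken from this training log.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # attention projections in GPT-J
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights remain trainable
```
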
54 changes: 54 additions & 0 deletions build_map.py
@@ -0,0 +1,54 @@
import numpy as np
from nomic import atlas
import glob
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
from sklearn.decomposition import PCA

# Collect the per-shard inference outputs written by inference.py.
files = glob.glob("inference/*.jsonl")
print(files)
df = concatenate_datasets([load_dataset("json", data_files=file, split="train") for file in tqdm(files)])

print(len(df))
print(df)

# Join prompt and response into a single "inputs" field for text indexing.
df = df.map(lambda example: {"inputs": [prompt + "\n" + response for prompt, response in zip(example["prompt"], example["response"])]},
            batched=True,
            num_proc=64)

# Flag whether each example was part of the training split.
df = df.map(lambda example: {"trained_on": [int(t) for t in example["is_train"]]},
            batched=True,
            num_proc=64)

df = df.remove_columns("is_train")

# Text-only view for the map built with Nomic's internal clustering.
text = df.remove_columns(["labels", "input_ids", "embeddings"])

text_df = [text[i] for i in range(len(text))]

atlas.map_text(text_df, indexed_field="inputs",
               name="CHANGE ME!",
               colorable_fields=["source", "loss", "trained_on"],
               reset_project_if_exists=True,
               )

# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))
# max embed dim is 2048 for now
# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)
embeddings = PCA(n_components=2048).fit_transform(embeddings)

data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

# Second map: built from the finetuned model's own embeddings.
atlas.map_embeddings(embeddings,
                     data=columns,
                     id_field="index",
                     name="CHANGE ME!",
                     colorable_fields=["source", "loss", "trained_on"],
                     build_topic_model=True,
                     topic_label_field="inputs",
                     reset_project_if_exists=True,
                     )
1 change: 1 addition & 0 deletions clean.py
@@ -64,6 +64,7 @@
df = df.dropna(subset=['prompt', 'response'])
df = df[df['prompt'] != '']
df = df[df['response'] != '']
df = df[df["prompt"].str.len() > 1]
curr_len = len(df)

print(f"Removed {prev_len - curr_len} rows")
48 changes: 48 additions & 0 deletions configs/deepspeed/ds_config_gptj.json
@@ -0,0 +1,48 @@
{
  "train_batch_size": "auto",
  "gradient_accumulation_steps": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "fp16": {
    "enabled": "auto",
    "min_loss_scale": 1,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "initial_scale_power": 32
  },
  "bf16": {
    "enabled": "auto"
  },
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 2,
    "offload_param": {
      "device": "none"
    },
    "offload_optimizer": {
      "device": "none"
    },
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "contiguous_gradients": true
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": [
        0.9,
        0.999
      ],
      "eps": 1e-08
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto",
      "warmup_type": "linear"
    }
  }
}
48 changes: 48 additions & 0 deletions configs/deepspeed/ds_config_gptj_lora.json
@@ -0,0 +1,48 @@
{
  "train_batch_size": "auto",
  "gradient_accumulation_steps": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "fp16": {
    "enabled": "auto",
    "min_loss_scale": 1,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "initial_scale_power": 32
  },
  "bf16": {
    "enabled": "auto"
  },
  "gradient_clipping": 1,
  "zero_optimization": {
    "stage": 2,
    "offload_param": {
      "device": "cpu"
    },
    "offload_optimizer": {
      "device": "cpu"
    },
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "contiguous_gradients": true
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": [
        0.9,
        0.999
      ],
      "eps": 1e-08
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto",
      "warmup_type": "linear"
    }
  }
}