forked from nomic-ai/gpt4all
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request nomic-ai#335 from nomic-ai/gptj
GPT-J
- Loading branch information
Showing
33 changed files
with
809 additions
and
203 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
*.pkl | ||
ckpts* | ||
.deepspeed_env | ||
*.jsonl | ||
*tar.gz | ||
ckpts** | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,3 @@ | ||
[submodule "transformers"] | ||
path = transformers | ||
url = https://github.com/huggingface/transformers.git | ||
[submodule "peft"] | ||
path = peft | ||
url = https://github.com/huggingface/peft.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Inference on Training Data | ||
|
||
|
||
## Run Inference | ||
|
||
```bash | ||
torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml | ||
``` | ||
|
||
|
||
## Visualizations | ||
|
||
```bash | ||
python build_map.py | ||
``` | ||
|
||
This builds two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import numpy as np | ||
from nomic import atlas | ||
import glob | ||
from tqdm import tqdm | ||
from datasets import load_dataset, concatenate_datasets | ||
from sklearn.decomposition import PCA | ||
|
||
# Load every JSONL file under inference/ and concatenate them into one dataset.
files = glob.glob("inference/*.jsonl")
print(files)
df = concatenate_datasets([load_dataset("json", data_files=file, split="train") for file in tqdm(files)])

print(len(df))
print(df)

# "inputs" is the prompt and the response joined by a newline; it is the
# field that gets indexed / used for topic labels in the Atlas calls below.
df = df.map(lambda example: {"inputs": [prompt + "\n" + response for prompt, response in zip(example["prompt"], example["response"])]},
            batched=True,
            num_proc=64)

# Store the "is_train" flag as an int under "trained_on", then drop the original.
df = df.map(lambda example: {"trained_on": [int(t) for t in example["is_train"]]},
            batched=True,
            num_proc=64)

df = df.remove_columns("is_train")

# First map: text only, so the token/embedding columns are removed.
text = df.remove_columns(["labels", "input_ids", "embeddings"])

# Materialize the dataset rows as a plain list of dicts for map_text.
text_df = list(text)

# NOTE(review): both Atlas calls below use the same placeholder name together
# with reset_project_if_exists=True; presumably the second call resets the
# project created by the first -- give each map its own name before running.
atlas.map_text(text_df, indexed_field="inputs",
               name="CHANGE ME!",
               colorable_fields=["source", "loss", "trained_on"],
               reset_project_if_exists=True,
               )

# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))
# max embed dim is 2048 for now
# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)
# PCA cannot produce more components than min(n_samples, n_features), so cap
# the 2048 target instead of failing on small runs or narrower embeddings.
n_components = min(2048, *embeddings.shape)
embeddings = PCA(n_components=n_components).fit_transform(embeddings)

# The reduced embeddings are passed separately; the rest becomes row metadata.
data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

# Second map: built from the (PCA-reduced) embeddings stored in the dataset.
atlas.map_embeddings(embeddings,
                     data=columns,
                     id_field="index",
                     name="CHANGE ME!",
                     colorable_fields=["source", "loss", "trained_on"],
                     build_topic_model=True,
                     topic_label_field="inputs",
                     reset_project_if_exists=True,)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
{ | ||
"train_batch_size": "auto", | ||
"gradient_accumulation_steps": "auto", | ||
"train_micro_batch_size_per_gpu": "auto", | ||
"fp16": { | ||
"enabled": "auto", | ||
"min_loss_scale": 1, | ||
"loss_scale_window": 1000, | ||
"hysteresis": 2, | ||
"initial_scale_power": 32 | ||
}, | ||
"bf16": { | ||
"enabled": "auto" | ||
}, | ||
"gradient_clipping": 1.0, | ||
"zero_optimization": { | ||
"stage": 2, | ||
"offload_param": { | ||
"device": "none" | ||
}, | ||
"offload_optimizer": { | ||
"device": "none" | ||
}, | ||
"allgather_partitions": true, | ||
"allgather_bucket_size": 5e8, | ||
"contiguous_gradients": true | ||
}, | ||
"optimizer": { | ||
"type": "AdamW", | ||
"params": { | ||
"lr": "auto", | ||
"betas": [ | ||
0.9, | ||
0.999 | ||
], | ||
"eps": 1e-08 | ||
} | ||
}, | ||
"scheduler": { | ||
"type": "WarmupLR", | ||
"params": { | ||
"warmup_min_lr": 0, | ||
"warmup_max_lr": "auto", | ||
"warmup_num_steps": "auto", | ||
"warmup_type": "linear" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
{ | ||
"train_batch_size": "auto", | ||
"gradient_accumulation_steps": "auto", | ||
"train_micro_batch_size_per_gpu": "auto", | ||
"fp16": { | ||
"enabled": "auto", | ||
"min_loss_scale": 1, | ||
"loss_scale_window": 1000, | ||
"hysteresis": 2, | ||
"initial_scale_power": 32 | ||
}, | ||
"bf16": { | ||
"enabled": "auto" | ||
}, | ||
"gradient_clipping": 1, | ||
"zero_optimization": { | ||
"stage": 2, | ||
"offload_param": { | ||
"device": "cpu" | ||
}, | ||
"offload_optimizer": { | ||
"device": "cpu" | ||
}, | ||
"allgather_partitions": true, | ||
"allgather_bucket_size": 5e8, | ||
"contiguous_gradients": true | ||
}, | ||
"optimizer": { | ||
"type": "AdamW", | ||
"params": { | ||
"lr": "auto", | ||
"betas": [ | ||
0.9, | ||
0.999 | ||
], | ||
"eps": 1e-08 | ||
} | ||
}, | ||
"scheduler": { | ||
"type": "WarmupLR", | ||
"params": { | ||
"warmup_min_lr": 0, | ||
"warmup_max_lr": "auto", | ||
"warmup_num_steps": "auto", | ||
"warmup_type": "linear" | ||
} | ||
} | ||
} |
Oops, something went wrong.