
Commit

raw add for testing
Signed-off-by: ftgreat <[email protected]>
ftgreat committed Jun 6, 2023
2 parents f5b51e3 + d48fa78 commit 65551f9
Showing 353 changed files with 2,671,010 additions and 312 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -28,3 +28,8 @@ qqp
glm_large_qqp_pytorch
wandb
clip_benchmark_datasets
examples/AltCLIP/clip_benchmark_datasets
examples/glm_pretrain/data.lazy
examples/glm_pretrain/examples/glm_pretrain/data.lazy
examples/vit_cifar100/cifar100
examples/vit_cifar100/data
CHANGELOG.md: mode changed 100644 → 100755 (no content changes)
CLA.md: mode changed 100644 → 100755 (no content changes)
CODE_OF_CONDUCT.md: mode changed 100644 → 100755 (no content changes)
COMMITTERS.csv: mode changed 100644 → 100755 (no content changes)
CONTRIBUTING.md: mode changed 100644 → 100755 (no content changes)
Dockerfile: mode changed 100644 → 100755 (no content changes)
GOVERNANCE.md: mode changed 100644 → 100755 (no content changes)
LICENSE: mode changed 100644 → 100755 (no content changes)
README.md: mode changed 100644 → 100755 (no content changes)
README_zh.md: mode changed 100644 → 100755 (no content changes)
SUPPORT.md: mode changed 100644 → 100755 (no content changes)
25 changes: 25 additions & 0 deletions examples/aquila/Aquila-30b-64n8g.yaml
@@ -0,0 +1,25 @@
# comments
batch_size: 4
gradient_accumulation_steps: 1
lr: 1.2e-4
warm_up: 0.01
save_interval: 500
weight_decay: 0.1

bmt_cpu_offload: False
bmt_pre_load: False
bmt_async_load: True
# bmt_loss_scale: 65536
bmt_loss_scale: 65536.0

save_optim: False
save_rng: False

# load_dir: '/share/project/64node-bmt-flashatten/checkpoints/Aquila-30b-64n8g-from-scratch'
load_optim: False
resume_dataset: False
bmt_lr_decay_style: 'cosine'
# warm_up_iters: 2000
# iteration_in_epoch: 9000


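These YAML files override training defaults and are handed to the launcher via --yaml_config (see bmtrain_mgpu.sh further down). As a rough illustration of how such a file could be merged over defaults, here is a minimal PyYAML-based sketch; the function name and default values are assumptions, not FlagAI's actual config loader.

```python
# Illustrative only: a generic PyYAML override loader, not FlagAI's actual config code.
import yaml

def load_training_config(yaml_path, defaults):
    """Merge key/value pairs from a config such as Aquila-30b-64n8g.yaml over defaults."""
    with open(yaml_path, "r", encoding="utf-8") as f:
        overrides = yaml.safe_load(f) or {}
    merged = dict(defaults)
    merged.update(overrides)  # values from the YAML file take precedence
    return merged

# Hypothetical usage with a few defaults mirroring the fields above.
defaults = {"batch_size": 1, "lr": 1.0e-4, "warm_up": 0.01, "save_interval": 1000}
cfg = load_training_config("examples/aquila/Aquila-30b-64n8g.yaml", defaults)
```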
16 changes: 16 additions & 0 deletions examples/aquila/Aquila-7b-1n8g-code.yaml
@@ -0,0 +1,16 @@
# comments
batch_size: 10
gradient_accumulation_steps: 1
lr: 2.e-4
warm_up: 0.001
save_interval: 500

bmt_cpu_offload: False
bmt_pre_load: False
bmt_async_load: False
bmt_loss_scale: 65536

save_optim: True
save_rng: True

load_optim: False
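The bmt_loss_scale value (65536 here) is a loss-scaling factor meant to keep small fp16 gradients from underflowing. Below is a minimal sketch of static loss scaling only; BMTrain's own scaler (typically dynamic) is not reproduced, so treat this as the general idea rather than the actual implementation.

```python
# Illustrative static loss scaling; BMTrain's internal scaler may adjust the factor dynamically.
import torch

LOSS_SCALE = 65536.0  # taken from bmt_loss_scale above; its exact use here is an assumption

def scaled_backward_step(loss, model, optimizer):
    (loss * LOSS_SCALE).backward()        # scale up so small fp16 gradients survive
    for p in model.parameters():
        if p.grad is not None:
            p.grad.div_(LOSS_SCALE)       # unscale before the optimizer update
    optimizer.step()
    optimizer.zero_grad()
```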
18 changes: 18 additions & 0 deletions examples/aquila/Aquila-7b-1n8g.yaml
@@ -0,0 +1,18 @@
# comments
batch_size: 4
gradient_accumulation_steps: 1
lr: 2.e-4
warm_up: 0.001
save_interval: 500

bmt_cpu_offload: False
bmt_pre_load: False
bmt_async_load: False
bmt_loss_scale: 65536

save_optim: True
save_rng: True

load_optim: False


33 changes: 33 additions & 0 deletions examples/aquila/Aquila-7b-24n8g-exp.yaml
@@ -0,0 +1,33 @@
# comments
batch_size: 10
gradient_accumulation_steps: 1
lr: 2.99e-4
warm_up: 0.01
save_interval: 10
log_interval: 10

bmt_cpu_offload: False
bmt_pre_load: False
#bmt_async_load: True
# bmt_loss_scale: 65536
bmt_loss_scale: 1048576.0

#enable_weighted_dataset_v2: False
#bmt_cpu_offload: False
#bmt_loss_scale: 131072

save_optim: True
save_rng: True

#save_optim: False
#save_rng: False
#load_rng: True
#load_dir: '/data2/checkpoints/Aquila-7b-24n8g-V3'
#load_optim: True
#resume_dataset: False
#skip_iters: 15000
#bmt_lr_decay_style: 'consine'
# warm_up_iters: 2000
# iteration_in_epoch: 9000


33 changes: 33 additions & 0 deletions examples/aquila/Aquila-7b-24n8g.yaml
@@ -0,0 +1,33 @@
# comments
batch_size: 10
gradient_accumulation_steps: 1
lr: 2.4e-4
warm_up: 0.01
save_interval: 1000
log_interval: 10

bmt_cpu_offload: False
bmt_pre_load: False
#bmt_async_load: True
# bmt_loss_scale: 65536
bmt_loss_scale: 1048576.0

#enable_weighted_dataset_v2: False
#bmt_cpu_offload: False
#bmt_loss_scale: 131072

save_optim: True
save_rng: True

#save_optim: False
#save_rng: False
#load_rng: True
#load_dir: '/data2/checkpoints/Aquila-7b-24n8g-V3'
#load_optim: True
resume_dataset: False
skip_iters: 17000
#bmt_lr_decay_style: 'consine'
# warm_up_iters: 2000
# iteration_in_epoch: 9000


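This config sets resume_dataset: False together with skip_iters: 17000, which points at resuming training part-way through the data by discarding batches already consumed before a restart. A hedged sketch of that idea follows; FlagAI's actual handling of these two flags is not shown here and may differ.

```python
# Illustrative fast-forward over already-consumed batches; not FlagAI's actual resume logic.
from itertools import islice

def resume_iterator(dataloader, skip_iters=0):
    """Yield batches from dataloader, discarding the first skip_iters batches."""
    it = iter(dataloader)
    for _ in islice(it, skip_iters):  # consume and throw away the skipped batches
        pass
    yield from it

# Hypothetical usage matching skip_iters above:
# for batch in resume_iterator(train_loader, skip_iters=17000):
#     train_step(batch)
```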
19 changes: 19 additions & 0 deletions examples/aquila/Aquila-7b-sft-3n8g-convo-v2.yaml
@@ -0,0 +1,19 @@
# comments
epochs: 1
batch_size: 16
gradient_accumulation_steps: 1
lr: 2.0e-5
warm_up: 0.03
#warm_up: 0.06
save_interval: 2000

weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999

bmt_cpu_offload: False

bmt_pre_load: True
enable_sft_conversations_dataset: True
enable_sft_dataset_dir: '/data/'
enable_sft_dataset_file: 'merge_chat_clean_convo_dataset.jsonl'
18 changes: 18 additions & 0 deletions examples/aquila/Aquila-7b-sft-4n8g-10m-convo.yaml
@@ -0,0 +1,18 @@
# comments
epochs: 1
batch_size: 4
gradient_accumulation_steps: 1
lr: 2.0e-5
warm_up_iters: 360
save_interval: 2000

weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999

bmt_cpu_offload: False

bmt_pre_load: True
enable_sft_conversations_dataset: True
enable_sft_dataset_dir: '/data/ldwang/sft_datasets'
enable_sft_dataset_file: 'merge_chat_clean_convo_dataset.jsonl'
19 changes: 19 additions & 0 deletions examples/aquila/Aquila-7b-sft-6n8g-10m-convo.yaml
@@ -0,0 +1,19 @@
# comments
epochs: 1
batch_size: 8
gradient_accumulation_steps: 1
lr: 2.0e-5
warm_up_iters: 360
warm_up: 0.06
save_interval: 2000

weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999

bmt_cpu_offload: False

bmt_pre_load: True
enable_sft_conversations_dataset: True
enable_sft_dataset_dir: '/data/ldwang/sft_datasets'
enable_sft_dataset_file: 'merge_chat_clean_convo_dataset.jsonl'
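The three SFT configs above all point enable_sft_dataset_dir / enable_sft_dataset_file at a JSONL conversation file. A minimal sketch of reading such a file, one JSON record per line, is shown below; the record schema and any preprocessing done by the enable_sft_conversations_dataset path are assumptions left out here.

```python
# Illustrative JSONL reader; the real field names and preprocessing used by
# FlagAI's SFT conversation dataset may differ.
import json
import os

def load_conversations(dataset_dir, dataset_file):
    """Read one JSON object per line from the configured SFT dataset file."""
    path = os.path.join(dataset_dir, dataset_file)
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records

# Hypothetical usage mirroring the config values above:
# records = load_conversations('/data/ldwang/sft_datasets',
#                              'merge_chat_clean_convo_dataset.jsonl')
```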
98 changes: 98 additions & 0 deletions examples/aquila/bmtrain_mgpu.sh
@@ -0,0 +1,98 @@
# ENVS
# TODO
export FLAGAI_HOME=/share/project/flagai-internal-bmt-flashatten
export WORKSPACE=/share/project/64node-bmt-flashatten

export PYTHONPATH=$FLAGAI_HOME
# export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_CUDA_SUPPORT=1
# export NCCL_IB_GID_INDEX=0
export NCCL_IB_HCA=mlx5_0,mlx5_3
export NCCL_DEBUG=debug
export OMP_NUM_THREADS=4

echo "[INFO] $0: hostfile configfile model_name exp_name exp_version"
set -u
hostfile=$1
configfile=$2
model_name=$3
exp_name=$4
exp_version=$5
set +u
# DIST
export HOSTFILE=$hostfile
export CONFIGFILE=$configfile
export NODE_ADDR=$(ifconfig -a|grep inet|grep -v 127.0.0.1|grep -v inet6|awk '{print $2;}'|tr -d "addr:")
export GPU_NUM_PER_NODE=$(awk -F" |=" '{ranks[$1]=$NF;}END{print ranks["'$NODE_ADDR'"];}' $HOSTFILE)
export NODES_NUM=$(cat $HOSTFILE | wc -l)
export MASTER_ADDR=$(head -n1 $HOSTFILE | awk '{print $1;}')
export RANK=$(awk '{ranks[$1]=(FNR-1);}END{print ranks["'$NODE_ADDR'"];}' $HOSTFILE)
export MASTER_PORT=23456

export TRIGGER_FILE=bmtrain_mgpu.sh
export SCRIPT_FILE=train_llama_bmtrain_datasets.py

## wandb
export WANDB_MODE=offline

## EXP
#export EXP_NAME=llama_7b_8n8g
export EXP_NAME=Aquila-7b-16n8g
export EXP_NAME=Aquila-7b-1n8g
export EXP_NAME=Aquila-7b-6n8g
#export MODEL_NAME=llama-7b-en
export MODEL_NAME=Aquila-7b

export MODEL_NAME=$model_name
export EXP_NAME=$exp_name

export STATE_DICT_DIR=$WORKSPACE/state_dict
export SAVE_DIR=$WORKSPACE/checkpoints/${EXP_NAME}
export WANDB_DIR=$WORKSPACE/wandb/${EXP_NAME}
export EXP_VERSION_DIR=$SAVE_DIR/$exp_version
mkdir -p $EXP_VERSION_DIR
mkdir -p $WANDB_DIR
## Backup ckpts & scripts into exp versions
# cp -r $STATE_DICT_DIR/$MODEL_NAME $EXP_VERSION_DIR
cp -r $WORKSPACE/$TRIGGER_FILE $EXP_VERSION_DIR
cp -r $hostfile $EXP_VERSION_DIR
cp -r $configfile $EXP_VERSION_DIR

export EPOCH_NUM=1
export BATCH_SIZE=6
export GRADIENT_ACCUM_STEPS=1
export LR=3.0e-4
export LR=1.0e-5
export LR=6.0e-5
export WARMUP_RATE=0.008
export WARMUP_RATE=0.02
export WARMUP_RATE=0.1
export WARMUP_RATE=0.2

## EXTRA OPTS
OPTS=" --batch_size $BATCH_SIZE \
--epochs $EPOCH_NUM \
--gradient_accumulation_steps $GRADIENT_ACCUM_STEPS \
--lr $LR \
--warm_up $WARMUP_RATE \
--weight_decay 0.1 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--save_dir $SAVE_DIR \
--experiment_name $EXP_NAME \
--model_name $MODEL_NAME \
--wandb_dir $WANDB_DIR \
--yaml_config $CONFIGFILE"

## Trigger job on Each Node when bmt or ddp.
python -u -m torch.distributed.launch \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
$SCRIPT_FILE \
--not_call_launch \
$OPTS

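The script is run once per node as `bash bmtrain_mgpu.sh hostfile configfile model_name exp_name exp_version`, and the ifconfig/awk one-liners derive NODE_ADDR, GPU_NUM_PER_NODE, NODES_NUM, MASTER_ADDR and RANK from the hostfile. A Python sketch of the same hostfile resolution is below, assuming lines of the form `ip slots=N`; the exact hostfile format used here is an assumption.

```python
# Illustrative re-implementation of the awk-based hostfile parsing above.
# Assumes lines such as "10.0.0.1 slots=8"; the actual format may differ.
def parse_hostfile(hostfile_path, node_addr):
    nodes = []  # (ip, gpus_per_node) in file order
    with open(hostfile_path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            ip = parts[0]
            gpus = int(parts[-1].split("=")[-1]) if len(parts) > 1 else 8
            nodes.append((ip, gpus))
    master_addr = nodes[0][0]                          # first host acts as MASTER_ADDR
    nodes_num = len(nodes)                             # NODES_NUM
    rank = [ip for ip, _ in nodes].index(node_addr)    # per-node rank, like RANK above
    gpu_num_per_node = dict(nodes)[node_addr]          # GPU_NUM_PER_NODE
    return master_addr, nodes_num, rank, gpu_num_per_node
```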
99 changes: 99 additions & 0 deletions examples/aquila/bmtrain_mgpu_flash.sh
@@ -0,0 +1,99 @@
# ENVS
# TODO
export FLAGAI_HOME=/data/ldwang/workspace/FlagAI
export WORKSPACE=/data/workspace_v3/bmt-flashatten-1T

export PYTHONPATH=$FLAGAI_HOME
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_IB_GID_INDEX=0
export NCCL_IB_HCA=mlx5_2,mlx5_5
export NCCL_DEBUG=debug
export OMP_NUM_THREADS=4

echo "[INFO] $0: hostfile configfile model_name exp_name exp_version"
set -u
hostfile=$1
configfile=$2
model_name=$3
exp_name=$4
exp_version=$5
set +u
# DIST
export HOSTFILE=$hostfile
export CONFIGFILE=$configfile
export NODE_ADDR=$(ifconfig -a|grep inet|grep -v 127.0.0.1|grep -v inet6|awk '{print $2;}'|tr -d "addr:")
export GPU_NUM_PER_NODE=$(awk -F" |=" '{ranks[$1]=$NF;}END{print ranks["'$NODE_ADDR'"];}' $HOSTFILE)
export NODES_NUM=$(cat $HOSTFILE | wc -l)
export MASTER_ADDR=$(head -n1 $HOSTFILE | awk '{print $1;}')
export RANK=$(awk '{ranks[$1]=(FNR-1);}END{print ranks["'$NODE_ADDR'"];}' $HOSTFILE)
export MASTER_PORT=23456

export TRIGGER_FILE=bmtrain_mgpu_flash.sh

export SCRIPT_FILE=train_llama_bmtrain_datasets-flash-1T.py

## wandb
export WANDB_MODE=offline

## EXP
#export EXP_NAME=llama_7b_8n8g
export EXP_NAME=Aquila-7b-16n8g
export EXP_NAME=Aquila-7b-1n8g
export EXP_NAME=Aquila-7b-6n8g
#export MODEL_NAME=llama-7b-en
export MODEL_NAME=Aquila-7b

export MODEL_NAME=$model_name
export EXP_NAME=$exp_name

export STATE_DICT_DIR=$WORKSPACE/state_dict
export SAVE_DIR=/data2/checkpoints/${EXP_NAME}
export WANDB_DIR=$WORKSPACE/wandb/${EXP_NAME}
export EXP_VERSION_DIR=$SAVE_DIR/$exp_version
mkdir -p $EXP_VERSION_DIR
mkdir -p $WANDB_DIR
## Backup ckpts & scripts into exp versions
# cp -r $STATE_DICT_DIR/$MODEL_NAME $EXP_VERSION_DIR
cp -r $WORKSPACE/$TRIGGER_FILE $EXP_VERSION_DIR
cp -r $hostfile $EXP_VERSION_DIR
cp -r $configfile $EXP_VERSION_DIR

export EPOCH_NUM=1
export BATCH_SIZE=6
export GRADIENT_ACCUM_STEPS=1
export LR=3.0e-4
export LR=1.0e-5
export LR=6.0e-5
export WARMUP_RATE=0.008
export WARMUP_RATE=0.02
export WARMUP_RATE=0.1
export WARMUP_RATE=0.2

## EXTRA OPTS
OPTS=" --batch_size $BATCH_SIZE \
--epochs $EPOCH_NUM \
--gradient_accumulation_steps $GRADIENT_ACCUM_STEPS \
--lr $LR \
--warm_up $WARMUP_RATE \
--weight_decay 0.1 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--save_dir $SAVE_DIR \
--experiment_name $EXP_NAME \
--model_name $MODEL_NAME \
--wandb_dir $WANDB_DIR \
--yaml_config $CONFIGFILE"

## Trigger job on Each Node when bmt or ddp.
python -u -m torch.distributed.launch \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
$SCRIPT_FILE \
--not_call_launch \
$OPTS

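bmtrain_mgpu_flash.sh mirrors the script above but targets the flash-attention entrypoint train_llama_bmtrain_datasets-flash-1T.py. Both launchers hand the $OPTS flags plus --yaml_config to the entrypoint through torch.distributed.launch, which also injects --local_rank. A minimal argparse mirror of those flag names is sketched below; the entrypoint's real argument parser is not shown in this commit and may differ.

```python
# Illustrative argparse mirror of the flags assembled in $OPTS; FlagAI's actual
# training entrypoint may parse these differently.
import argparse

parser = argparse.ArgumentParser()
for name, typ in [("--batch_size", int), ("--epochs", int),
                  ("--gradient_accumulation_steps", int), ("--lr", float),
                  ("--warm_up", float), ("--weight_decay", float),
                  ("--adam_beta1", float), ("--adam_beta2", float)]:
    parser.add_argument(name, type=typ)
for name in ["--save_dir", "--experiment_name", "--model_name",
             "--wandb_dir", "--yaml_config"]:
    parser.add_argument(name)
parser.add_argument("--not_call_launch", action="store_true")
parser.add_argument("--local_rank", type=int, default=0)  # added by torch.distributed.launch
args, _unknown = parser.parse_known_args()
```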