finetune_starcoderbase.sh
#!/bin/bash
#SBATCH --job-name=bigcode
#SBATCH --partition=gpu_p5
#SBATCH --constraint=a100
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node; torch.distributed.run spawns the per-GPU processes
#SBATCH --cpus-per-task=64 # number of cores per task
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:8 # number of gpus per node
#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=ajs@a100
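# Resources requested above: 8 nodes x 8 A100s = 64 GPUs, with a single srun task per
# node; the torch.distributed launcher defined further down spawns one process per GPU.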
set -x -e
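# -x echoes every command as it runs, -e aborts the job on the first failing command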
source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
# defining the right environment variables
export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
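# The caches above live on the shared work area; the *_OFFLINE flags keep
# transformers/datasets from trying to reach the network (compute nodes typically
# have no internet access).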
echo "START TIME: $(date)"
GPUS_PER_NODE=8
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6001
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
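# 8 GPUs per node x 8 nodes -> WORLD_SIZE=64 ranks; the same settings are
# re-assembled into $LAUNCHER further below.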
#CHECKPOINT_PATH=/gpfswork/rech/ajs/commun/code/bigcode/finetune/bigcode
CHECKPOINT_PATH=/gpfsscratch/rech/ajs/commun/Bigcode-large-megatron
WEIGHTS_TRAIN=/gpfswork/rech/ajs/commun/code/bigcode/finetune/train_data_paths_big.txt.tmp
# "train: 1.0 0:0.950 /gpfswork/rech/ajs/commun/code/bigcode/finetune/train_bigcode"
WEIGHTS_VALID=/gpfswork/rech/ajs/commun/code/bigcode/finetune/valid_data_paths_big.txt.tmp
# "validation: 1.0 0.950:1 /gpfswork/rech/ajs/commun/code/bigcode/finetune/train_bigcode"
TOKENIZER_FILE=/gpfsscratch/rech/ajs/commun/large-model/tokenizer.json
LOG_PATH=$CHECKPOINT_PATH/main_log.txt
cd Megatron-LM
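# finetune_mtf.py (invoked below) is resolved relative to this Megatron-LM checkout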
# Changes from the pretraining setup:
# - No --use-flash-attn: our masking is different, so it is not certain to work; it only affects speed anyway
# - Doubled the LR
# - 50000 steps (using 3x the batch size of QL, so 50K steps here correspond to 150K steps in the QL setup)
# Pretraining script:
# https://github.com/bigcode-project/Megatron-LM/blob/22b86119ef3d42879ac949cdf1a37056b0156049/examples/pretrain_bigcode_model.slurm
GPT_ARGS="\
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
--sequence-parallel \
--recompute-activations \
--num-layers 40 \
--hidden-size 6144 \
--num-attention-heads 48 \
--attention-head-type multiquery \
--init-method-std 0.01275 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 1 \
--global-batch-size 128 \
--lr 0.00005 \
--min-lr 0.000005 \
--train-iters 10000 \
--lr-decay-iters 10000 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.01 \
--weight-decay .05 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--log-interval 10 \
--save-interval 250 \
--eval-interval 250 \
--eval-iters 2 \
--use-distributed-optimizer \
--valid-num-workers 0 \
--reset-progress \
--no-load-rng \
--no-load-optim \
--finetune \
--norm-target-loss \
--loss-on-targets-only \
"
#--fim-rate 0.5 \
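# Parallel layout implied by the flags above: tensor-parallel 4 x pipeline-parallel 4
# = 16-way model parallelism, so the 64 GPUs give a data-parallel degree of 4; with
# micro-batch 1, the global batch of 128 means 128 / (1 * 4) = 32 accumulation steps.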
TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"
CMD=" \
finetune_mtf.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $WEIGHTS_TRAIN \
--valid-weighted-split-paths-path $WEIGHTS_VALID \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
"
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
echo $CMD
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"
# do not remove this workaround, or the training will hang and nodes will be lost
export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in PyTorch 1.12
export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
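# $SLURM_PROCID and $SLURMD_NODENAME are escaped (\$) so they expand on each node inside
# the srun-launched shell (every node gets its own --node_rank), not on the submit node.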
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
# rm -rf $CHECKPOINT_PATH
echo "END TIME: $(date)"