Skip to content

Commit

Permalink
applied LoRA to DeBERTa and RoBERTa
Browse files Browse the repository at this point in the history
  • Loading branch information
edwardjhu committed Sep 20, 2021
1 parent ff46e01 commit b80d0c4
Show file tree
Hide file tree
Showing 53 changed files with 1,705 additions and 375 deletions.
331 changes: 76 additions & 255 deletions NLU/README.md

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions NLU/deberta_v2_xxlarge_cola.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE CoLA with LoRA (r=16, alpha=32).
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./cola"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--task_name cola \
--do_train \
--do_eval \
--max_seq_length 64 \
--per_device_train_batch_size 4 \
--learning_rate 1.3e-4 \
--num_train_epochs 10 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 20 \
--save_strategy steps \
--save_steps 20 \
--warmup_steps 100 \
--cls_dropout 0.1 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0 \
--use_deterministic_algorithms
31 changes: 31 additions & 0 deletions NLU/deberta_v2_xxlarge_mnli.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE MNLI with LoRA (r=16, alpha=32).
# The resulting LoRA checkpoint is reused as initialization by the
# MRPC/RTE/STS-B scripts (via their --lora_path flag).
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./mnli"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--task_name mnli \
--do_train \
--do_eval \
--max_seq_length 256 \
--per_device_train_batch_size 8 \
--learning_rate 1e-4 \
--num_train_epochs 5 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 500 \
--save_strategy steps \
--save_steps 500 \
--warmup_steps 1000 \
--cls_dropout 0.15 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0 \
--use_deterministic_algorithms
32 changes: 32 additions & 0 deletions NLU/deberta_v2_xxlarge_mrpc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE MRPC with LoRA (r=16, alpha=32),
# warm-starting the LoRA weights from an MNLI run (--lora_path below),
# so run the MNLI script first.
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./mrpc"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--lora_path mnli/pytorch_model_lora.bin \
--task_name mrpc \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-4 \
--num_train_epochs 30 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 10 \
--save_strategy steps \
--save_steps 10 \
--warmup_ratio 0.1 \
--cls_dropout 0 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0.01 \
--use_deterministic_algorithms
31 changes: 31 additions & 0 deletions NLU/deberta_v2_xxlarge_qnli.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE QNLI with LoRA (r=16, alpha=32).
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./qnli"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--task_name qnli \
--do_train \
--do_eval \
--max_seq_length 512 \
--per_device_train_batch_size 6 \
--learning_rate 1e-4 \
--num_train_epochs 8 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 500 \
--save_strategy steps \
--save_steps 500 \
--warmup_steps 500 \
--cls_dropout 0.1 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0.01 \
--use_deterministic_algorithms
31 changes: 31 additions & 0 deletions NLU/deberta_v2_xxlarge_qqp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE QQP with LoRA (r=16, alpha=32).
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./qqp"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--task_name qqp \
--do_train \
--do_eval \
--max_seq_length 320 \
--per_device_train_batch_size 8 \
--learning_rate 1e-4 \
--num_train_epochs 11 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 500 \
--save_strategy steps \
--save_steps 500 \
--warmup_steps 10000 \
--cls_dropout 0.2 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0.01 \
--use_deterministic_algorithms
32 changes: 32 additions & 0 deletions NLU/deberta_v2_xxlarge_rte.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE RTE with LoRA (r=16, alpha=32),
# warm-starting the LoRA weights from an MNLI run (--lora_path below),
# so run the MNLI script first.
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./rte"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--lora_path mnli/pytorch_model_lora.bin \
--task_name rte \
--do_train \
--do_eval \
--max_seq_length 320 \
--per_device_train_batch_size 4 \
--learning_rate 2.6e-4 \
--num_train_epochs 11 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 20 \
--save_strategy steps \
--save_steps 20 \
--warmup_steps 50 \
--cls_dropout 0.2 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0.01 \
--use_deterministic_algorithms
31 changes: 31 additions & 0 deletions NLU/deberta_v2_xxlarge_sst2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE SST-2 with LoRA (r=16, alpha=32).
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./sst2"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--task_name sst2 \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 8 \
--learning_rate 6e-5 \
--num_train_epochs 16 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 100 \
--save_strategy steps \
--save_steps 100 \
--warmup_steps 1000 \
--cls_dropout 0 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0.01 \
--use_deterministic_algorithms
32 changes: 32 additions & 0 deletions NLU/deberta_v2_xxlarge_stsb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Fine-tune DeBERTa-v2 XXLarge on GLUE STS-B with LoRA (r=16, alpha=32),
# warm-starting the LoRA weights from an MNLI run (--lora_path below),
# so run the MNLI script first.
# Run from the NLU/ directory; expects 8 GPUs on one node.
set -euo pipefail

export num_gpus=8
# Fixed-size cuBLAS workspace for reproducible results; see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
export CUBLAS_WORKSPACE_CONFIG=":16:8"
export PYTHONHASHSEED=0
export output_dir="./stsb"
python -m torch.distributed.launch --nproc_per_node="$num_gpus" \
examples/text-classification/run_glue.py \
--model_name_or_path microsoft/deberta-v2-xxlarge \
--lora_path mnli/pytorch_model_lora.bin \
--task_name stsb \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 4 \
--learning_rate 2e-4 \
--num_train_epochs 10 \
--output_dir "$output_dir/model" \
--overwrite_output_dir \
--logging_steps 10 \
--logging_dir "$output_dir/log" \
--fp16 \
--evaluation_strategy steps \
--eval_steps 50 \
--save_strategy steps \
--save_steps 50 \
--warmup_steps 100 \
--cls_dropout 0.2 \
--apply_lora \
--lora_r 16 \
--lora_alpha 32 \
--seed 0 \
--weight_decay 0.1 \
--use_deterministic_algorithms
25 changes: 25 additions & 0 deletions NLU/ds_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
  "fp16": {
    "enabled": true,
    "initial_scale_power": 12
  },
  "zero_optimization": {
    "stage": 2,
    "reduce_bucket_size": 5e7,
    "allgather_bucket_size": 1.25e9,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "zero_allow_untested_optimizer": true
}

106 changes: 106 additions & 0 deletions NLU/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
name: NLU
channels:
- pytorch
- nvidia
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- blas=1.0=mkl
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2021.7.5=h06a4308_1
- certifi=2021.5.30=py37h06a4308_0
- cudatoolkit=11.1.74=h6bb024c_0
- ffmpeg=4.3=hf484d3e_0
- freetype=2.10.4=h5ab3b9f_0
- gmp=6.2.1=h2531618_2
- gnutls=3.6.15=he1e5248_0
- intel-openmp=2021.2.0=h06a4308_610
- jpeg=9b=h024ee3a_2
- lame=3.100=h7b6447c_0
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libiconv=1.15=h63c8f33_5
- libidn2=2.3.1=h27cfd23_0
- libpng=1.6.37=hbc83047_0
- libstdcxx-ng=9.3.0=hd4cf53a_17
- libtasn1=4.16.0=h27cfd23_0
- libtiff=4.2.0=h85742a9_0
- libunistring=0.9.10=h27cfd23_0
- libuv=1.40.0=h7b6447c_0
- libwebp-base=1.2.0=h27cfd23_0
- lz4-c=1.9.3=h2531618_0
- mkl=2021.2.0=h06a4308_296
- mkl-service=2.4.0=py37h7f8727e_0
- mkl_fft=1.3.0=py37h42c9631_2
- mkl_random=1.2.1=py37ha9443f7_2
- ncurses=6.2=he6710b0_1
- nettle=3.7.3=hbbd107a_1
- numpy=1.20.2=py37h2d18471_0
- numpy-base=1.20.2=py37hfae3a4d_0
- olefile=0.46=py37_0
- openh264=2.1.0=hd408876_0
- openjpeg=2.3.0=h05c96fa_1
- openssl=1.1.1k=h27cfd23_0
- pillow=8.3.1=py37h2c7a002_0
- pip=21.1.3=py37h06a4308_0
- python=3.7.10=h12debd9_4
- pytorch=1.9.0=py3.7_cuda11.1_cudnn8.0.5_0
- readline=8.1=h27cfd23_0
- setuptools=52.0.0=py37h06a4308_0
- six=1.16.0=pyhd3eb1b0_0
- sqlite=3.36.0=hc218d9a_0
- tk=8.6.10=hbc83047_0
- torchaudio=0.9.0=py37
- torchvision=0.10.0=py37_cu111
- typing_extensions=3.10.0.0=pyh06a4308_0
- wheel=0.36.2=pyhd3eb1b0_0
- xz=5.2.5=h7b6447c_0
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.9=haebb681_0
- pip:
- accelerate==0.3.0
- charset-normalizer==2.0.1
- click==8.0.1
- datasets==1.9.0
- deepspeed==0.5.0
- dill==0.3.4
- filelock==3.0.12
- fsspec==2021.7.0
- huggingface-hub==0.0.13
- idna==3.2
- importlib-metadata==4.6.1
- joblib==1.0.1
- multiprocess==0.70.12.2
- ninja==1.10.0.post2
- packaging==21.0
- pandas==1.3.0
- protobuf==3.17.3
- psutil==5.8.0
- pyaml==20.4.0
- pyarrow==4.0.1
- pyparsing==2.4.7
- python-dateutil==2.8.1
- pytz==2021.1
- pyyaml==5.4.1
- regex==2021.7.6
- requests==2.26.0
- sacremoses==0.0.45
- scikit-learn==0.24.2
- scipy==1.7.0
- sentencepiece==0.1.96
- sklearn==0.0
- tensorboardx==1.8
- threadpoolctl==2.2.0
- tokenizers==0.10.3
- tqdm==4.61.2
- triton==0.4.2
- urllib3==1.26.5
- xxhash==2.0.2
- zipp==3.5.0
- wrapt==1.12.1
- azureml-core==1.32.0
prefix: /opt/conda/envs/transformers
Loading

0 comments on commit b80d0c4

Please sign in to comment.