Merge pull request X-LANCE#52 from ddlBoJack/dev-mzy
update example/asr_librispeech
ddlBoJack authored Apr 26, 2024
2 parents 08277f4 + 7703ac5 commit 4386c54
Showing 8 changed files with 201 additions and 35 deletions.
36 changes: 36 additions & 0 deletions examples/asr_librispeech/README.md
@@ -0,0 +1,36 @@
# ASR_Librispeech

## Performance and checkpoints
We only train the linear projector in this recipe.
| Encoder | Projector | LLM | test-clean | test-other |
|---|---|---|---|---|
| [WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear]()(~18.88M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 2.28 | 4.78 |


## Data preparation
You need to prepare your data as a JSONL file in the following format:
```
{"key": "1001-134707-0000_ASR", "source": "/data/open_data/librispeech_audio/audio/librispeech_1001-134707-0000.wav", "target": "1 little recks the laborer. How near his work is holding him to God, The loving laborer through space and time, after all, not to create, only or found only."}
...
{"key": "1001-134707-0000_ASR", "source": "/data/open_data/librispeech_audio/audio/librispeech_1001-134707-0000.wav", "target": "1 little recks the laborer. How near his work is holding him to God, The loving laborer through space and time, after all, not to create, only or found only."}
```
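If you build this manifest yourself, a minimal sketch along these lines may help (the directory layout, transcript format, and all paths are assumptions; adapt them to wherever your LibriSpeech audio and `.trans.txt` files live):
```
# build_manifest.py -- illustrative sketch, not part of the recipe.
# Assumes LibriSpeech-style "<utt-id> <TEXT>" lines in *.trans.txt files
# and audio renamed to librispeech_<utt-id>.wav, as in the example above.
import json
from pathlib import Path

audio_root = Path("/data/open_data/librispeech_audio/audio")  # placeholder
out_path = Path("librispeech_train.jsonl")

with out_path.open("w") as out:
    for trans_file in audio_root.rglob("*.trans.txt"):
        for line in trans_file.read_text().splitlines():
            utt_id, text = line.split(" ", 1)
            entry = {
                "key": f"{utt_id}_ASR",
                "source": str(audio_root / f"librispeech_{utt_id}.wav"),
                "target": text,
            }
            out.write(json.dumps(entry) + "\n")
```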

## Decode with checkpoints
```
bash decode_wavlm_large_linear_vicuna_7b.sh
```
Modify the paths, including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path`, `val_data_path`, and `decode_log`, in the script before running it.
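For example, the variable block at the top of the script might be edited like this (every path below is a placeholder):
```
# Placeholder paths -- point these at your own models, data, and checkpoint.
speech_encoder_path=/path/to/WavLM-Large.pt
llm_path=/path/to/vicuna-7b-v1.5
output_dir=/path/to/exp/wavlm-linear-vicuna-7b
ckpt_path=$output_dir/asr_epoch_1_step_1000
val_data_path=/path/to/librispeech_test_clean.jsonl
decode_log=$ckpt_path/decode_test_clean_beam4
```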

## Train a new model

### Use Whisper as the encoder
```
bash finetune_whisper_large_linear_vicuna_7b.sh
```
Whisper takes mel spectrograms as input. Pay attention to the key `dataset_config.mel_size`, which differs across versions of the Whisper model family (80 for large v1/v2, 128 for v3).
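For example, when using whisper large-v3, the scripts below pass:
```
++dataset_config.input_type=mel \
++dataset_config.mel_size=128 \
```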

### Use a self-supervised model (such as WavLM) as the encoder
```
bash finetune_wavlm_large_linear_vicuna_7b.sh
```
WavLM takes the raw waveform as input. Pay attention to the keys `dataset_config.normalize` and `model_config.normalize`: different SSL models expect different settings for these keys.
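For WavLM-Large, both keys are set to `true`, as the scripts below do:
```
++model_config.normalize=true \
++dataset_config.normalize=true \
```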
6 changes: 6 additions & 0 deletions examples/asr_librispeech/asr_config.py
@@ -14,6 +14,9 @@ class ModelConfig:
encoder_projector: str = "linear"
encoder_projector_ds_rate: int = 5
modal: str = "audio"
normalize: Optional[bool] = field(default=False, metadata={
"help": "whether input is normalized, used for models such as wavlm"
})

@dataclass
class PeftConfig:
@@ -93,6 +96,9 @@ class DataConfig:
mel_size: int = field(default=80, metadata={
"help": "80 for whisper large v1 and v2, 128 for v3"
})
normalize: Optional[bool] = field(default=False, metadata={
"help": "whether input is normalized, used for models such as wavlm"
})

@dataclass
class FSDPConfig:
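The `normalize` flag mirrors the fairseq convention for WavLM-style encoders, which layer-normalize the raw waveform per utterance. A minimal sketch of how a dataset might consume it (a hypothetical helper, not this repo's actual code):
```
import torch
import torch.nn.functional as F

def maybe_normalize(wav: torch.Tensor, normalize: bool = False) -> torch.Tensor:
    # Hypothetical helper: layer-normalize a mono waveform of shape [T],
    # as SSL encoders trained on normalized input (e.g. WavLM) expect.
    if normalize:
        wav = F.layer_norm(wav, wav.shape)
    return wav
```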
3 changes: 2 additions & 1 deletion examples/asr_librispeech/conf/prompt.yaml
@@ -1,3 +1,4 @@
dataset_config:
# we put the prompt here because the hydra override in the shell script only supports a small subset of characters
prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. "
# prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. "
prompt: "Transcribe speech to text. "
58 changes: 58 additions & 0 deletions examples/asr_librispeech/decode_wavlm_large_linear_vicuna_7b.sh
@@ -0,0 +1,58 @@
#!/bin/bash
#export PYTHONPATH=/root/whisper:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
# export CUDA_LAUNCH_BLOCKING=1

run_dir=/root/SLAM-LLM
cd $run_dir
code_dir=examples/asr_librispeech

speech_encoder_path=/nfs/maziyang.mzy/models/wavlm/WavLM-Large.pt
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5

output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-wavlm-large-20240426
ckpt_path=$output_dir/asr_epoch_1_step_1000
split=librispeech_test_clean
val_data_path=/nfs/maziyang.mzy/data/librispeech/${split}.jsonl
decode_log=$ckpt_path/decode_${split}_beam4

# -m debugpy --listen 5678 --wait-for-client
python $code_dir/inference_asr_batch.py \
--config-path "conf" \
--config-name "prompt.yaml" \
hydra.run.dir=$ckpt_path \
++model_config.llm_name="vicuna-7b-v1.5" \
++model_config.llm_path=$llm_path \
++model_config.llm_dim=4096 \
++model_config.encoder_name=wavlm \
++model_config.normalize=true \
++dataset_config.normalize=true \
++model_config.encoder_projector_ds_rate=5 \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=1024 \
++model_config.encoder_projector=linear \
++dataset_config.dataset=speech_dataset \
++dataset_config.val_data_path=$val_data_path \
++dataset_config.input_type=raw \
++dataset_config.inference_mode=true \
++train_config.model_name=asr \
++train_config.freeze_encoder=true \
++train_config.freeze_llm=true \
++train_config.batching_strategy=custom \
++train_config.num_epochs=1 \
++train_config.val_batch_size=4 \
++train_config.num_workers_dataloader=2 \
++train_config.output_dir=$output_dir \
++decode_log=$decode_log \
++ckpt_path=$ckpt_path/model.pt \
# ++peft_ckpt=$ckpt_path \
# ++train_config.use_peft=true \
# ++train_config.peft_config.r=32 \
# ++dataset_config.normalize=true \
# ++model_config.encoder_projector=q-former \
# ++dataset_config.fix_length_audio=64 \

python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
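The last three lines normalize the reference and hypothesis text with Whisper's text normalizer, then score them. For intuition, WER is the word-level edit distance divided by the reference length; a minimal sketch (not the repo's `compute_wer.py`):
```
# Illustrative word error rate via edit distance; not the repo's script.
def wer(ref: str, hyp: str) -> float:
    r, h = ref.split(), hyp.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(r)][len(h)] / max(len(r), 1)

print(wer("hello world", "hello word"))  # 0.5
```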
examples/asr_librispeech/decode_whisper_large_linear_vicuna_7b.sh
@@ -1,22 +1,20 @@
#!/bin/bash
#export PYTHONPATH=/root/whisper:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=7
export CUDA_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
# export CUDA_LAUNCH_BLOCKING=1

run_dir=/work/SLAM-LLM
run_dir=/root/SLAM-LLM
cd $run_dir
code_dir=examples/asr_librispeech/
code_dir=examples/asr_librispeech

speech_encoder_path=/cxgroup/model/whisper/large-v3.pt
speech_encoder_path=/nfs/maziyang.mzy/models/Whisper/large-v3.pt
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5

# llm_path=/cxgroup/model/Llama-2-7b-chat-hf
llm_path=/cxgroup/model/vicuna-7b-v1.5

output_dir=/work/exps/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-20240316-baseline
ckpt_path=$output_dir/asr/3/846
split=test-other
val_data_path=data/librispeech/${split}.jsonl
output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-20240426
ckpt_path=$output_dir/asr_epoch_1_step_1000
split=librispeech_test_clean
val_data_path=/nfs/maziyang.mzy/data/librispeech/${split}.jsonl
decode_log=$ckpt_path/decode_${split}_beam4

# -m debugpy --listen 5678 --wait-for-client
@@ -28,6 +26,7 @@ python $code_dir/inference_asr_batch.py \
++model_config.llm_path=$llm_path \
++model_config.llm_dim=4096 \
++model_config.encoder_name=whisper \
++model_config.encoder_projector_ds_rate=5 \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=1280 \
++model_config.encoder_projector=linear \
@@ -37,24 +36,21 @@ python $code_dir/inference_asr_batch.py \
++dataset_config.mel_size=128 \
++dataset_config.inference_mode=true \
++train_config.model_name=asr \
++train_config.freeze_encoder=true \
++train_config.freeze_llm=true \
++train_config.batching_strategy=custom \
++train_config.num_epochs=1 \
++train_config.val_batch_size=4 \
++train_config.num_workers_dataloader=4 \
++train_config.num_workers_dataloader=2 \
++train_config.output_dir=$output_dir \
++decode_log=$decode_log \
++ckpt_path=$ckpt_path/model.pt \
++train_config.freeze_encoder=true \
++train_config.freeze_llm=true \
# ++peft_ckpt=$ckpt_path \
# ++train_config.use_peft=true \
# ++train_config.peft_config.r=32 \
# ++dataset_config.normalize=true \
# ++model_config.encoder_projector=q-former \
# ++dataset_config.fix_length_audio=64 \
# --peft_ckpt $peft_ckpt \
# ++ckpt_path=$ckpt_path/model.pt \
# --use_peft --peft_method lora \

python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
75 changes: 75 additions & 0 deletions examples/asr_librispeech/finetune_wavlm_large_linear_vicuna_7b.sh
@@ -0,0 +1,75 @@
#!/bin/bash
# export PYTHONPATH=/root/whisper:$PYTHONPATH
export PYTHONPATH=/root/fairseq:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0,1
export TOKENIZERS_PARALLELISM=false
# export CUDA_LAUNCH_BLOCKING=1
export OMP_NUM_THREADS=1

# debug setting for multiple gpus
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO

run_dir=/root/SLAM-LLM
cd $run_dir
code_dir=examples/asr_librispeech

speech_encoder_path=/nfs/maziyang.mzy/models/wavlm/WavLM-Large.pt
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5
train_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl
val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other.jsonl

output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-wavlm-large-$(date +"%Y%m%d")

hydra_args="
hydra.run.dir=$output_dir \
++model_config.llm_name=vicuna-7b-v1.5 \
++model_config.llm_path=$llm_path \
++model_config.llm_dim=4096 \
++model_config.encoder_name=wavlm \
++model_config.normalize=true \
++dataset_config.normalize=true \
++model_config.encoder_projector_ds_rate=5 \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=1024 \
++model_config.encoder_projector=linear \
++dataset_config.dataset=speech_dataset \
++dataset_config.train_data_path=$train_data_path \
++dataset_config.val_data_path=$val_data_path \
++dataset_config.input_type=raw \
++train_config.model_name=asr \
++train_config.num_epochs=3 \
++train_config.freeze_encoder=true \
++train_config.freeze_llm=true \
++train_config.batching_strategy=custom \
++train_config.warmup_steps=1000 \
++train_config.total_steps=100000 \
++train_config.lr=1e-4 \
++train_config.validation_interval=1000 \
++train_config.batch_size_training=4 \
++train_config.val_batch_size=4 \
++train_config.num_workers_dataloader=2 \
++train_config.output_dir=$output_dir \
++metric=acc \
"

# -m debugpy --listen 5678 --wait-for-client
if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then
python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \
--config-path "conf" \
--config-name "prompt.yaml" \
$hydra_args
else
torchrun \
--nnodes 1 \
--nproc_per_node 2 \
--master_port=29503 \
$code_dir/finetune_asr.py \
--config-path "conf" \
--config-name "prompt.yaml" \
++train_config.enable_fsdp=false \
++train_config.enable_ddp=true \
++train_config.use_fp16=true \
$hydra_args
fi
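Note the branch on `CUDA_VISIBLE_DEVICES`: a single device id starts a `debugpy` session that blocks until a client attaches, while a comma-separated list launches DDP training via `torchrun`. A typical two-GPU run (assuming the script sits in `examples/asr_librispeech/`):
```
# Two device ids -> the torchrun/DDP branch; one id would start debugpy and block.
CUDA_VISIBLE_DEVICES=0,1 bash examples/asr_librispeech/finetune_wavlm_large_linear_vicuna_7b.sh
```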
examples/asr_librispeech/finetune_whisper_large_linear_vicuna_7b.sh
@@ -1,7 +1,7 @@
#!/bin/bash
# export PYTHONPATH=/root/whisper:$PYTHONPATH
export PYTHONPATH=/root/fairseq:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=4,5
export CUDA_VISIBLE_DEVICES=0,1
export TOKENIZERS_PARALLELISM=false
# export CUDA_LAUNCH_BLOCKING=1
export OMP_NUM_THREADS=1
@@ -11,16 +11,16 @@ export OMP_NUM_THREADS=1
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO

run_dir=/work/SLAM-LLM
run_dir=/root/SLAM-LLM
cd $run_dir
code_dir=examples/asr_librispeech/
code_dir=examples/asr_librispeech

speech_encoder_path=/cxgroup/model/whisper/large-v3.pt
speech_encoder_path=/nfs/maziyang.mzy/models/Whisper/large-v3.pt
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5
train_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl
val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other.jsonl

llm_path=/cxgroup/model/vicuna-7b-v1.5
# llm_path=/nfs/maziyang.mzy/models/vicuna-13b-v1.5

output_dir=/work/exps/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-$(date +"%Y%m%d")
output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-$(date +"%Y%m%d")

hydra_args="
hydra.run.dir=$output_dir \
@@ -33,8 +33,8 @@ hydra.run.dir=$output_dir \
++model_config.encoder_dim=1280 \
++model_config.encoder_projector=linear \
++dataset_config.dataset=speech_dataset \
++dataset_config.train_data_path=data/librispeech/train960.jsonl \
++dataset_config.val_data_path=data/librispeech/dev.jsonl \
++dataset_config.train_data_path=$train_data_path \
++dataset_config.val_data_path=$val_data_path \
++dataset_config.input_type=mel \
++dataset_config.mel_size=128 \
++train_config.model_name=asr \
@@ -55,7 +55,7 @@ hydra.run.dir=$output_dir \

# -m debugpy --listen 5678 --wait-for-client
if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then
python -m debugpy --listen 5678 --wait-for-client finetune_asr.py \
python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \
--config-path "conf" \
--config-name "prompt.yaml" \
$hydra_args
@@ -72,6 +72,3 @@ else
++train_config.use_fp16=true \
$hydra_args
fi

# {"key": "1001-134707-0000_ASR", "prompt": "<ASR>", "source": "/cpfs01/shared/Group-speech/beinian.lzr/data/open_data/librispeech_audio/audio/se_librispeech_1001-134707-0000.wav", "target": "1 little recks the laborer. How near his work is holding him to God, The loving laborer through space and time, after all, not to create, only or found only.", "target_len": 157, "source_len": 1581, "text-type": "Transcribe", "audio_language": "en", "text_language": "en", "task-type": "<ASR>"}
# {"key": "1688-142285-0005", "prompt": "<ASR>", "source": "/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/librispeech/test_other/wav/1688-142285-0005.wav", "target": "YOU WHO WERE ALWAYS ACCUSING PEOPLE OF BEING SHOPPY AT HELSTONE", "target_len": 11, "source_len": 220, "text-type": "Transcribe", "audio_language": "en", "text_language": "en", "task-type": "<ASR>"}
@@ -90,6 +90,3 @@ deepspeed \
# $hydra_args
# # --num_gpus=2 \
# fi

# {"key": "1001-134707-0000_ASR", "prompt": "<ASR>", "source": "/cpfs01/shared/Group-speech/beinian.lzr/data/open_data/librispeech_audio/audio/se_librispeech_1001-134707-0000.wav", "target": "1 little recks the laborer. How near his work is holding him to God, The loving laborer through space and time, after all, not to create, only or found only.", "target_len": 157, "source_len": 1581, "text-type": "Transcribe", "audio_language": "en", "text_language": "en", "task-type": "<ASR>"}
# {"key": "1688-142285-0005", "prompt": "<ASR>", "source": "/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/librispeech/test_other/wav/1688-142285-0005.wav", "target": "YOU WHO WERE ALWAYS ACCUSING PEOPLE OF BEING SHOPPY AT HELSTONE", "target_len": 11, "source_len": 220, "text-type": "Transcribe", "audio_language": "en", "text_language": "en", "task-type": "<ASR>"}
