Add VitsSVC implementation (open-mmlab#14)
Add the VAE- and flow-based model VitsSVC, implemented in the style of so-vits-svc
viewfinder-annn authored Dec 8, 2023
1 parent ea507af commit 554b791
Showing 16 changed files with 1,349 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -42,6 +42,7 @@ ckpts
*.pt
*.npy
*.npz
!modules/whisper_extractor/assets/mel_filters.npz
*.tar.gz
*.ckpt
*.wav
9 changes: 6 additions & 3 deletions bins/svc/inference.py
@@ -14,6 +14,7 @@
from models.svc.diffusion.diffusion_inference import DiffusionInference
from models.svc.comosvc.comosvc_inference import ComoSVCInference
from models.svc.transformer.transformer_inference import TransformerInference
from models.svc.vits.vits_inference import VitsInference
from utils.util import load_config
from utils.audio_slicer import split_audio, merge_segments_encodec
from processors import acoustic_extractor, content_extractor
@@ -24,6 +25,7 @@ def build_inference(args, cfg, infer_type="from_dataset"):
"DiffWaveNetSVC": DiffusionInference,
"DiffComoSVC": ComoSVCInference,
"TransformerSVC": TransformerInference,
"VitsSVC": VitsInference,
}

inference_class = supported_inference[cfg.model_type]
@@ -48,9 +50,10 @@ def prepare_for_audio_file(args, cfg, num_workers=1):
acoustic_extractor.extract_utt_acoustic_features_serial(
metadata, temp_audio_dir, cfg
)
acoustic_extractor.cal_mel_min_max(
dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
)
if cfg.preprocess.use_min_max_norm_mel == True:
acoustic_extractor.cal_mel_min_max(
dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
)
acoustic_extractor.cal_pitch_statistics_svc(
dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
)
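
The change above guards the `cal_mel_min_max` step behind `cfg.preprocess.use_min_max_norm_mel`. As a rough sketch of what min-max mel normalization generally computes (an illustration of the technique, not Amphion's exact code):

```python
import numpy as np

def mel_min_max_stats(mels):
    """mels: list of (T_i, n_mel) arrays; returns per-bin min and max."""
    stacked = np.concatenate(mels, axis=0)
    return stacked.min(axis=0), stacked.max(axis=0)

def normalize_mel(mel, mel_min, mel_max, eps=1e-8):
    """Scale each mel bin into [0, 1] using dataset-level statistics."""
    return (mel - mel_min) / (mel_max - mel_min + eps)

mels = [np.random.randn(120, 100), np.random.randn(80, 100)]  # (frames, n_mel=100)
mmin, mmax = mel_min_max_stats(mels)
normed = normalize_mel(mels[0], mmin, mmax)
print(normed.min() >= 0.0, normed.max() <= 1.0)  # True True
```
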
2 changes: 2 additions & 0 deletions bins/svc/train.py
@@ -10,6 +10,7 @@
from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
from models.svc.transformer.transformer_trainer import TransformerTrainer
from models.svc.vits.vits_trainer import VitsSVCTrainer
from utils.util import load_config


@@ -18,6 +19,7 @@ def build_trainer(args, cfg):
"DiffWaveNetSVC": DiffusionTrainer,
"DiffComoSVC": ComoSVCTrainer,
"TransformerSVC": TransformerTrainer,
"VitsSVC": VitsSVCTrainer,
}

trainer_class = supported_trainer[cfg.model_type]
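
Both `bins/svc/inference.py` and `bins/svc/train.py` register the new model through the same string-keyed registry: `cfg.model_type` selects the class to instantiate. A minimal sketch of the pattern (stand-in classes, not the real trainers):

```python
class DiffusionTrainer: ...
class ComoSVCTrainer: ...
class TransformerTrainer: ...
class VitsSVCTrainer: ...

SUPPORTED_TRAINERS = {
    "DiffWaveNetSVC": DiffusionTrainer,
    "DiffComoSVC": ComoSVCTrainer,
    "TransformerSVC": TransformerTrainer,
    "VitsSVC": VitsSVCTrainer,  # the entry this commit adds
}

def build_trainer(cfg: dict):
    try:
        return SUPPORTED_TRAINERS[cfg["model_type"]]()
    except KeyError as e:
        raise ValueError(f"Unsupported model_type: {cfg['model_type']!r}") from e

print(type(build_trainer({"model_type": "VitsSVC"})).__name__)  # VitsSVCTrainer
```
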
192 changes: 192 additions & 0 deletions config/vitssvc.json
@@ -0,0 +1,192 @@
{
"base_config": "config/base.json",
"model_type": "VITS",
"task_type": "svc",
"preprocess": {
"extract_phone": false,
"extract_mel": true,
"extract_linear_spec": true,
"extract_audio": true,
"use_linear": true,
"use_mel": true,
"use_audio": true,
"use_text": false,
"use_phone": true,

"fmin": 0,
"fmax": null,
"f0_min": 50,
"f0_max": 1100,
// f0_bin in sovits
"pitch_bin": 256,
// filter_length in sovits
"n_fft": 2048,
// hop_length in sovits
"hop_size": 512,
// win_length in sovits
"win_size": 2048,
"segment_size": 8192,
"n_mel": 100,
"sample_rate": 44100,

"mel_min_max_stats_dir": "mel_min_max_stats",
"whisper_dir": "whisper",
"contentvec_dir": "contentvec",
"wenet_dir": "wenet",
"mert_dir": "mert",
},
"model": {
"condition_encoder": {
"merge_mode": "add",
"input_melody_dim": 1,
"use_log_f0": true,
"n_bins_melody": 256,
// Quantization (0 means no quantization)
"output_melody_dim": 196,
"input_loudness_dim": 1,
"use_log_loudness": false,
"n_bins_loudness": 256,
"output_loudness_dim": 196,
"use_whisper": false,
"use_contentvec": false,
"use_wenet": false,
"use_mert": false,
"whisper_dim": 1024,
"contentvec_dim": 256,
"mert_dim": 256,
"wenet_dim": 512,
"content_encoder_dim": 196,
"output_singer_dim": 196,
"singer_table_size": 512,
"output_content_dim": 196,
"use_spkid": true
},
"vits": {
"filter_channels": 256,
"gin_channels": 256,
"hidden_channels": 192,
"inter_channels": 192,
"kernel_size": 3,
"n_flow_layer": 4,
"n_heads": 2,
"n_layers": 6,
"n_layers_q": 3,
"n_speakers": 512,
"p_dropout": 0.1,
"ssl_dim": 256,
"use_spectral_norm": false,
},
"generator": "hifigan",
"generator_config": {
"hifigan": {
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"upsample_rates": [
8,8,2,2,2
],
"upsample_kernel_sizes": [
16,16,4,4,4
],
"upsample_initial_channel": 512,
"resblock_dilation_sizes": [
[1,3,5],
[1,3,5],
[1,3,5]
]
},
"melgan": {
"ratios": [8, 8, 2, 2, 2],
"ngf": 32,
"n_residual_layers": 3,
"num_D": 3,
"ndf": 16,
"n_layers": 4,
"downsampling_factor": 4
},
"bigvgan": {
"resblock": "1",
"activation": "snakebeta",
"snake_logscale": true,
"upsample_rates": [
8,8,2,2,2,
],
"upsample_kernel_sizes": [
16,16,4,4,4,
],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[1,3,5],
[1,3,5],
[1,3,5]
]
},
"nsfhifigan": {
"resblock": "1",
"harmonic_num": 8,
"upsample_rates": [
8,8,2,2,2,
],
"upsample_kernel_sizes": [
16,16,4,4,4,
],
"upsample_initial_channel": 768,
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[1,3,5],
[1,3,5],
[1,3,5]
]
},
"apnet": {
"ASP_channel": 512,
"ASP_resblock_kernel_sizes": [3,7,11],
"ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"ASP_input_conv_kernel_size": 7,
"ASP_output_conv_kernel_size": 7,

"PSP_channel": 512,
"PSP_resblock_kernel_sizes": [3,7,11],
"PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"PSP_input_conv_kernel_size": 7,
"PSP_output_R_conv_kernel_size": 7,
"PSP_output_I_conv_kernel_size": 7,
}
},
},
"train": {
"fp16_run": true,
"learning_rate": 2e-4,
"betas": [
0.8,
0.99
],
"eps": 1e-9,
"batch_size": 16,
"lr_decay": 0.999875,
// "segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"AdamW": {
"betas": [
0.8,
0.99
],
"eps": 1e-9,
}
}
}
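
Two relationships in this config are easy to miss: the vocoder's `upsample_rates` must multiply out to `hop_size` so that one mel frame maps back to exactly one hop of samples, and F0 is quantized into `pitch_bin` bins on a log scale between `f0_min` and `f0_max` (`use_log_f0`). A small sketch of both (the `quantize_f0` helper is hypothetical, not an Amphion API):

```python
import numpy as np

hop_size = 512
upsample_rates = [8, 8, 2, 2, 2]  # hifigan/nsfhifigan settings above
assert np.prod(upsample_rates) == hop_size  # one frame -> hop_size samples

def quantize_f0(f0_hz, f0_min=50.0, f0_max=1100.0, n_bins=256):
    """Map F0 (Hz) to integer bins on a log scale (cf. use_log_f0, pitch_bin)."""
    f0 = np.clip(f0_hz, f0_min, f0_max)
    scaled = (np.log(f0) - np.log(f0_min)) / (np.log(f0_max) - np.log(f0_min))
    return np.minimum((scaled * n_bins).astype(np.int64), n_bins - 1)

print(quantize_f0(np.array([50.0, 440.0, 1100.0])))  # [  0 180 255]
```
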
2 changes: 1 addition & 1 deletion egs/svc/README.md
@@ -29,6 +29,6 @@ Until now, Amphion SVC has supported the following features and models:
- Transformer-based models:
- **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
- VAE- and Flow-based models:
- **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **Waveform Synthesizers (Vocoders)**:
- The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
125 changes: 125 additions & 0 deletions egs/svc/VitsSVC/README.md
@@ -0,0 +1,125 @@
# VITS for Singing Voice Conversion

This is an implementation of VITS as an acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), the SoftVC content encoder is used to extract content features from the source audio. These feature vectors are fed directly into VITS without conversion to a text-based intermediate representation.
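
As a mental model of this conditioning flow, here is a minimal, self-contained sketch (all module names and dimensions are illustrative, not the Amphion implementation): frame-level content features are projected and merged with F0 and speaker embeddings, and the result is what replaces the text encoding in a VITS-like model.

```python
import torch
import torch.nn as nn

class ToyContentConditioner(nn.Module):
    """Merge content features with melody (F0) and speaker conditions via
    addition, mirroring the "add" merge_mode in the default config."""
    def __init__(self, content_dim=256, cond_dim=192, n_singers=512, n_f0_bins=256):
        super().__init__()
        self.content_proj = nn.Linear(content_dim, cond_dim)
        self.f0_embed = nn.Embedding(n_f0_bins, cond_dim)
        self.spk_embed = nn.Embedding(n_singers, cond_dim)

    def forward(self, content, f0_bins, spk_id):
        # content: (B, T, content_dim); f0_bins: (B, T) int64; spk_id: (B,) int64
        h = self.content_proj(content) + self.f0_embed(f0_bins)
        return h + self.spk_embed(spk_id).unsqueeze(1)  # broadcast over time

B, T = 2, 100
cond = ToyContentConditioner()
h = cond(torch.randn(B, T, 256), torch.randint(0, 256, (B, T)), torch.tensor([3, 7]))
print(h.shape)  # torch.Size([2, 100, 192]) -> input to the VITS prior encoder
```
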

There are four stages in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```
## 1. Data Preparation
### Dataset Download
By default, we use five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
### Configuration
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
```json
"dataset": [
"m4singer",
"opencpop",
"opensinger",
"svcc",
"vctk"
],
"dataset_path": {
// TODO: Fill in your dataset path
"m4singer": "[M4Singer dataset path]",
"opencpop": "[Opencpop dataset path]",
"opensinger": "[OpenSinger dataset path]",
"svcc": "[SVCC dataset path]",
"vctk": "[VCTK dataset path]"
},
```
## 2. Feature Extraction

### Content-based Pretrained Models Download

By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
"log_dir": "ckpts/svc",
"preprocess": {
// TODO: Fill in the output data path. The default value is "Amphion/data"
"processed_dir": "data",
...
},
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/svc/VitsSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
## 3. Training

### Configuration

We provide default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24 GB GPU; you can adjust them based on your GPU machines.

```json
"train": {
"batch_size": 32,
...
"adamw": {
"lr": 2.0e-4
},
...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
## 4. Inference/Conversion

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experiment directory which contains `checkpoint`. | `[Your path to save logs and checkpoints]/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audio. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a JSON file or a directory). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` should be a folder that includes several audio files (*.wav, *.mp3, or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the Opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
--infer_source_audio_dir [Your Audios Folder] \
--infer_target_speaker "opencpop_female1" \
--infer_key_shift "autoshift"
```
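
As a side note on `--infer_key_shift`: transposition follows the equal-tempered scale, so shifting by `k` semitones scales F0 by 2^(k/12). This is standard music math, independent of Amphion:

```python
def shift_f0(f0_hz: float, semitones: float) -> float:
    """Transpose F0 by a number of equal-tempered semitones."""
    return f0_hz * 2 ** (semitones / 12)

print(round(shift_f0(220.0, 3), 1))   # 261.6 Hz (A3 up three semitones, ~C4)
print(round(shift_f0(220.0, -3), 1))  # 185.0 Hz
```
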