Add VitsSVC implementation (open-mmlab#14)
Add the VAE- and Flow-based model: VitsSVC, which is implemented like so-vits-svc
1 parent ea507af · commit 554b791
Showing 16 changed files with 1,349 additions and 11 deletions.
@@ -42,6 +42,7 @@ ckpts
 *.pt
 *.npy
 *.npz
+!modules/whisper_extractor/assets/mel_filters.npz
 *.tar.gz
 *.ckpt
 *.wav
@@ -0,0 +1,192 @@
{
    "base_config": "config/base.json",
    "model_type": "VITS",
    "task_type": "svc",
    "preprocess": {
        "extract_phone": false,
        "extract_mel": true,
        "extract_linear_spec": true,
        "extract_audio": true,
        "use_linear": true,
        "use_mel": true,
        "use_audio": true,
        "use_text": false,
        "use_phone": true,

        "fmin": 0,
        "fmax": null,
        "f0_min": 50,
        "f0_max": 1100,
        // f0_bin in sovits
        "pitch_bin": 256,
        // filter_length in sovits
        "n_fft": 2048,
        // hop_length in sovits
        "hop_size": 512,
        // win_length in sovits
        "win_size": 2048,
        "segment_size": 8192,
        "n_mel": 100,
        "sample_rate": 44100,

        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256,
            // Quantization (0 means no quantization)
"output_melody_dim": 196, | ||
"input_loudness_dim": 1, | ||
"use_log_loudness": false, | ||
"n_bins_loudness": 256, | ||
"output_loudness_dim": 196, | ||
"use_whisper": false, | ||
"use_contentvec": false, | ||
"use_wenet": false, | ||
"use_mert": false, | ||
"whisper_dim": 1024, | ||
"contentvec_dim": 256, | ||
"mert_dim": 256, | ||
"wenet_dim": 512, | ||
"content_encoder_dim": 196, | ||
"output_singer_dim": 196, | ||
"singer_table_size": 512, | ||
"output_content_dim": 196, | ||
"use_spkid": true | ||
}, | ||
"vits": { | ||
"filter_channels": 256, | ||
"gin_channels": 256, | ||
"hidden_channels": 192, | ||
"inter_channels": 192, | ||
"kernel_size": 3, | ||
"n_flow_layer": 4, | ||
"n_heads": 2, | ||
"n_layers": 6, | ||
"n_layers_q": 3, | ||
"n_speakers": 512, | ||
"p_dropout": 0.1, | ||
"ssl_dim": 256, | ||
"use_spectral_norm": false, | ||
}, | ||
"generator": "hifigan", | ||
"generator_config": { | ||
"hifigan": { | ||
"resblock": "1", | ||
"resblock_kernel_sizes": [ | ||
3, | ||
7, | ||
11 | ||
], | ||
"upsample_rates": [ | ||
8,8,2,2,2 | ||
], | ||
"upsample_kernel_sizes": [ | ||
16,16,4,4,4 | ||
], | ||
"upsample_initial_channel": 512, | ||
"resblock_dilation_sizes": [ | ||
[1,3,5], | ||
[1,3,5], | ||
[1,3,5] | ||
] | ||
}, | ||
"melgan": { | ||
"ratios": [8, 8, 2, 2, 2], | ||
"ngf": 32, | ||
"n_residual_layers": 3, | ||
"num_D": 3, | ||
"ndf": 16, | ||
"n_layers": 4, | ||
"downsampling_factor": 4 | ||
}, | ||
"bigvgan": { | ||
"resblock": "1", | ||
"activation": "snakebeta", | ||
"snake_logscale": true, | ||
"upsample_rates": [ | ||
8,8,2,2,2, | ||
], | ||
"upsample_kernel_sizes": [ | ||
16,16,4,4,4, | ||
], | ||
"upsample_initial_channel": 512, | ||
"resblock_kernel_sizes": [ | ||
3, | ||
7, | ||
11 | ||
], | ||
"resblock_dilation_sizes": [ | ||
[1,3,5], | ||
[1,3,5], | ||
[1,3,5] | ||
] | ||
}, | ||
"nsfhifigan": { | ||
"resblock": "1", | ||
"harmonic_num": 8, | ||
"upsample_rates": [ | ||
8,8,2,2,2, | ||
], | ||
"upsample_kernel_sizes": [ | ||
16,16,4,4,4, | ||
], | ||
"upsample_initial_channel": 768, | ||
"resblock_kernel_sizes": [ | ||
3, | ||
7, | ||
11 | ||
], | ||
"resblock_dilation_sizes": [ | ||
[1,3,5], | ||
[1,3,5], | ||
[1,3,5] | ||
] | ||
}, | ||
"apnet": { | ||
"ASP_channel": 512, | ||
"ASP_resblock_kernel_sizes": [3,7,11], | ||
"ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], | ||
"ASP_input_conv_kernel_size": 7, | ||
"ASP_output_conv_kernel_size": 7, | ||
|
||
"PSP_channel": 512, | ||
"PSP_resblock_kernel_sizes": [3,7,11], | ||
"PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], | ||
"PSP_input_conv_kernel_size": 7, | ||
"PSP_output_R_conv_kernel_size": 7, | ||
"PSP_output_I_conv_kernel_size": 7, | ||
} | ||
}, | ||
}, | ||
"train": { | ||
"fp16_run": true, | ||
"learning_rate": 2e-4, | ||
"betas": [ | ||
0.8, | ||
0.99 | ||
], | ||
"eps": 1e-9, | ||
"batch_size": 16, | ||
"lr_decay": 0.999875, | ||
// "segment_size": 8192, | ||
"init_lr_ratio": 1, | ||
"warmup_epochs": 0, | ||
"c_mel": 45, | ||
"c_kl": 1.0, | ||
"AdamW": { | ||
"betas": [ | ||
0.8, | ||
0.99 | ||
], | ||
"eps": 1e-9, | ||
} | ||
} | ||
} |
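
For a quick sanity check of this config (the file the recipe README below calls `exp_config.json`), the snippet below parses it and derives the frame timing its STFT settings imply. This is a hedged sketch: the file path and the use of the `json5` package (tolerant of the `//` comments above) are assumptions, not Amphion's own loading code.

```python
# Hedged sketch: parse the commented config and derive its frame timing.
# Assumptions: the file lives at egs/svc/VitsSVC/exp_config.json, and a
# JSON5-tolerant parser (pip install json5) is acceptable; Amphion's own
# loader may differ.
import json5

with open("egs/svc/VitsSVC/exp_config.json") as f:
    cfg = json5.load(f)

pre = cfg["preprocess"]
print(f"frame rate: {pre['sample_rate'] / pre['hop_size']:.2f} frames/s")        # ~86.13
print(f"analysis window: {1000 * pre['n_fft'] / pre['sample_rate']:.1f} ms")     # ~46.4
print(f"training segment: {1000 * pre['segment_size'] / pre['sample_rate']:.1f} ms")  # ~185.8
```
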
@@ -0,0 +1,125 @@
# VITS for Singing Voice Conversion

This is an implementation of VITS as an acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), it uses the SoftVC content encoder to extract content features from the source audio. These feature vectors are fed directly into VITS, with no need to convert them into a text-based intermediate representation.
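
To make that data flow concrete, here is a toy sketch of the conditioning step: content features and quantized F0 are merged into one sequence by addition, mirroring `merge_mode: "add"` and the 256/196 dimensions in this recipe's `exp_config.json`. The class and variable names are hypothetical, not Amphion's actual modules.

```python
# Toy illustration of the conditioning idea (hypothetical names, not
# Amphion's actual classes): content features replace text/phoneme inputs.
import torch
import torch.nn as nn

class ToyConditionEncoder(nn.Module):
    """Merges content features and quantized F0 into one conditioning sequence."""
    def __init__(self, content_dim=256, n_pitch_bins=256, out_dim=196):
        super().__init__()
        self.content_proj = nn.Linear(content_dim, out_dim)
        self.pitch_emb = nn.Embedding(n_pitch_bins, out_dim)

    def forward(self, content, pitch_ids):
        # content: [B, T, content_dim] from a pretrained content encoder
        # pitch_ids: [B, T], F0 quantized into n_pitch_bins bins
        return self.content_proj(content) + self.pitch_emb(pitch_ids)

enc = ToyConditionEncoder()
cond = enc(torch.randn(2, 100, 256), torch.randint(0, 256, (2, 100)))
print(cond.shape)  # torch.Size([2, 100, 196]) -> fed to VITS in place of text
```
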
There are four stages in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize the following five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```
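
Before moving on, it can save time to confirm that every configured path actually exists. A minimal hedged check, where the dict literal stands in for the values you filled into `exp_config.json`:

```python
# Minimal sanity check for the dataset paths configured above.
# The dict literal stands in for your filled-in exp_config.json values.
import os

dataset_path = {
    "m4singer": "[M4Singer dataset path]",
    "opencpop": "[Opencpop dataset path]",
    "opensinger": "[OpenSinger dataset path]",
    "svcc": "[SVCC dataset path]",
    "vctk": "[VCTK dataset path]",
}
for name, path in dataset_path.items():
    status = "ok" if os.path.isdir(path) else "MISSING"
    print(f"{name}: {status} ({path})")
```
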
## 2. Feature Extraction

### Content-based Pretrained Models Download

By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` for the preprocessing stage (set `--stage 1`):

```bash
sh egs/svc/VitsSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

We provide default hyperparameters in `exp_config.json`. They work on a single 24 GB NVIDIA GPU; you can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```
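
For intuition about how the learning rate evolves under the per-epoch `lr_decay` set in this recipe's `exp_config.json` (exponential decay from `learning_rate`), a quick hedged calculation:

```python
# Hedged illustration of the exponential LR decay configured in this
# commit's exp_config.json (learning_rate 2e-4, lr_decay 0.999875/epoch).
lr, decay = 2e-4, 0.999875
for epoch in (0, 100, 1000, 10000):
    print(f"epoch {epoch:>5}: lr = {lr * decay ** epoch:.3e}")
# 0.999875 ** 1000 ~ 0.88, so the LR falls to ~88% after 1000 epochs.
```
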

### Run

Run `run.sh` for the training stage (set `--stage 2`) and specify an experiment name. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experiment directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audio. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a JSON file or a directory). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3, or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the Opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
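
To pick a valid value for `--infer_target_speaker`, you can list the trained speakers recorded in the `singers.json` mentioned above. A hedged sketch, assuming the file is a JSON object keyed by speaker name (as the `opencpop_female1` example suggests):

```python
# List trained speaker names to choose an --infer_target_speaker value.
# Assumption: singers.json is a JSON object keyed by speaker name, as the
# table above suggests. Replace [YourExptName] with your experiment name.
import json

with open("ckpts/svc/[YourExptName]/singers.json") as f:
    singers = json.load(f)
print(sorted(singers))  # e.g. ['opencpop_female1', ...]
```
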

For example, if you want to make `opencpop_female1` sing the songs in `[Your Audios Folder]`, just run:

```bash
sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```