Add VitsSVC implementation (open-mmlab#14)
Add the VAE- and flow-based model VitsSVC, implemented in the style of so-vits-svc
viewfinder-annn authored Dec 8, 2023
1 parent ea507af commit 554b791
Showing 16 changed files with 1,349 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -42,6 +42,7 @@ ckpts
*.pt
*.npy
*.npz
!modules/whisper_extractor/assets/mel_filters.npz
*.tar.gz
*.ckpt
*.wav
9 changes: 6 additions & 3 deletions bins/svc/inference.py
@@ -14,6 +14,7 @@
from models.svc.diffusion.diffusion_inference import DiffusionInference
from models.svc.comosvc.comosvc_inference import ComoSVCInference
from models.svc.transformer.transformer_inference import TransformerInference
from models.svc.vits.vits_inference import VitsInference
from utils.util import load_config
from utils.audio_slicer import split_audio, merge_segments_encodec
from processors import acoustic_extractor, content_extractor
@@ -24,6 +25,7 @@ def build_inference(args, cfg, infer_type="from_dataset"):
"DiffWaveNetSVC": DiffusionInference,
"DiffComoSVC": ComoSVCInference,
"TransformerSVC": TransformerInference,
"VitsSVC": VitsInference,
}

inference_class = supported_inference[cfg.model_type]
@@ -48,9 +50,10 @@ def prepare_for_audio_file(args, cfg, num_workers=1):
acoustic_extractor.extract_utt_acoustic_features_serial(
metadata, temp_audio_dir, cfg
)
acoustic_extractor.cal_mel_min_max(
dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
)
if cfg.preprocess.use_min_max_norm_mel == True:
acoustic_extractor.cal_mel_min_max(
dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
)
acoustic_extractor.cal_pitch_statistics_svc(
dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
)
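
The change above guards the `cal_mel_min_max` step behind `cfg.preprocess.use_min_max_norm_mel`. As a rough sketch of what min-max mel normalization generally computes (an illustration of the technique, not Amphion's exact code):

```python
import numpy as np

def mel_min_max_stats(mels):
    """mels: list of (T_i, n_mel) arrays; returns per-bin min and max."""
    stacked = np.concatenate(mels, axis=0)
    return stacked.min(axis=0), stacked.max(axis=0)

def normalize_mel(mel, mel_min, mel_max, eps=1e-8):
    """Scale each mel bin into [0, 1] using dataset-level statistics."""
    return (mel - mel_min) / (mel_max - mel_min + eps)

mels = [np.random.randn(120, 100), np.random.randn(80, 100)]  # (frames, n_mel=100)
mmin, mmax = mel_min_max_stats(mels)
normed = normalize_mel(mels[0], mmin, mmax)
print(normed.min() >= 0.0, normed.max() <= 1.0)  # True True
```
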
2 changes: 2 additions & 0 deletions bins/svc/train.py
@@ -10,6 +10,7 @@
from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
from models.svc.transformer.transformer_trainer import TransformerTrainer
from models.svc.vits.vits_trainer import VitsSVCTrainer
from utils.util import load_config


@@ -18,6 +19,7 @@ def build_trainer(args, cfg):
"DiffWaveNetSVC": DiffusionTrainer,
"DiffComoSVC": ComoSVCTrainer,
"TransformerSVC": TransformerTrainer,
"VitsSVC": VitsSVCTrainer,
}

trainer_class = supported_trainer[cfg.model_type]
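
Both `bins/svc/inference.py` and `bins/svc/train.py` register the new model through the same string-keyed registry: `cfg.model_type` selects the class to instantiate. A minimal sketch of the pattern (stand-in classes, not the real trainers):

```python
class DiffusionTrainer: ...
class ComoSVCTrainer: ...
class TransformerTrainer: ...
class VitsSVCTrainer: ...

SUPPORTED_TRAINERS = {
    "DiffWaveNetSVC": DiffusionTrainer,
    "DiffComoSVC": ComoSVCTrainer,
    "TransformerSVC": TransformerTrainer,
    "VitsSVC": VitsSVCTrainer,  # the entry this commit adds
}

def build_trainer(cfg: dict):
    try:
        return SUPPORTED_TRAINERS[cfg["model_type"]]()
    except KeyError as e:
        raise ValueError(f"Unsupported model_type: {cfg['model_type']!r}") from e

print(type(build_trainer({"model_type": "VitsSVC"})).__name__)  # VitsSVCTrainer
```
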
192 changes: 192 additions & 0 deletions config/vitssvc.json
@@ -0,0 +1,192 @@
{
"base_config": "config/base.json",
"model_type": "VITS",
"task_type": "svc",
"preprocess": {
"extract_phone": false,
"extract_mel": true,
"extract_linear_spec": true,
"extract_audio": true,
"use_linear": true,
"use_mel": true,
"use_audio": true,
"use_text": false,
"use_phone": true,

"fmin": 0,
"fmax": null,
"f0_min": 50,
"f0_max": 1100,
// f0_bin in sovits
"pitch_bin": 256,
// filter_length in sovits
"n_fft": 2048,
// hop_length in sovits
"hop_size": 512,
// win_length in sovits
"win_size": 2048,
"segment_size": 8192,
"n_mel": 100,
"sample_rate": 44100,

"mel_min_max_stats_dir": "mel_min_max_stats",
"whisper_dir": "whisper",
"contentvec_dir": "contentvec",
"wenet_dir": "wenet",
"mert_dir": "mert",
},
"model": {
"condition_encoder": {
"merge_mode": "add",
"input_melody_dim": 1,
"use_log_f0": true,
"n_bins_melody": 256,
// Quantization (0 means no quantization)
"output_melody_dim": 196,
"input_loudness_dim": 1,
"use_log_loudness": false,
"n_bins_loudness": 256,
"output_loudness_dim": 196,
"use_whisper": false,
"use_contentvec": false,
"use_wenet": false,
"use_mert": false,
"whisper_dim": 1024,
"contentvec_dim": 256,
"mert_dim": 256,
"wenet_dim": 512,
"content_encoder_dim": 196,
"output_singer_dim": 196,
"singer_table_size": 512,
"output_content_dim": 196,
"use_spkid": true
},
"vits": {
"filter_channels": 256,
"gin_channels": 256,
"hidden_channels": 192,
"inter_channels": 192,
"kernel_size": 3,
"n_flow_layer": 4,
"n_heads": 2,
"n_layers": 6,
"n_layers_q": 3,
"n_speakers": 512,
"p_dropout": 0.1,
"ssl_dim": 256,
"use_spectral_norm": false,
},
"generator": "hifigan",
"generator_config": {
"hifigan": {
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"upsample_rates": [
8,8,2,2,2
],
"upsample_kernel_sizes": [
16,16,4,4,4
],
"upsample_initial_channel": 512,
"resblock_dilation_sizes": [
[1,3,5],
[1,3,5],
[1,3,5]
]
},
"melgan": {
"ratios": [8, 8, 2, 2, 2],
"ngf": 32,
"n_residual_layers": 3,
"num_D": 3,
"ndf": 16,
"n_layers": 4,
"downsampling_factor": 4
},
"bigvgan": {
"resblock": "1",
"activation": "snakebeta",
"snake_logscale": true,
"upsample_rates": [
8,8,2,2,2,
],
"upsample_kernel_sizes": [
16,16,4,4,4,
],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[1,3,5],
[1,3,5],
[1,3,5]
]
},
"nsfhifigan": {
"resblock": "1",
"harmonic_num": 8,
"upsample_rates": [
8,8,2,2,2,
],
"upsample_kernel_sizes": [
16,16,4,4,4,
],
"upsample_initial_channel": 768,
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[1,3,5],
[1,3,5],
[1,3,5]
]
},
"apnet": {
"ASP_channel": 512,
"ASP_resblock_kernel_sizes": [3,7,11],
"ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"ASP_input_conv_kernel_size": 7,
"ASP_output_conv_kernel_size": 7,

"PSP_channel": 512,
"PSP_resblock_kernel_sizes": [3,7,11],
"PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"PSP_input_conv_kernel_size": 7,
"PSP_output_R_conv_kernel_size": 7,
"PSP_output_I_conv_kernel_size": 7,
}
},
},
"train": {
"fp16_run": true,
"learning_rate": 2e-4,
"betas": [
0.8,
0.99
],
"eps": 1e-9,
"batch_size": 16,
"lr_decay": 0.999875,
// "segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"AdamW": {
"betas": [
0.8,
0.99
],
"eps": 1e-9,
}
}
}
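
Two relationships in this config are easy to miss: the vocoder's `upsample_rates` must multiply out to `hop_size` so that one mel frame maps back to exactly one hop of samples, and F0 is quantized into `pitch_bin` bins on a log scale between `f0_min` and `f0_max` (`use_log_f0`). A small sketch of both (the `quantize_f0` helper is hypothetical, not an Amphion API):

```python
import numpy as np

hop_size = 512
upsample_rates = [8, 8, 2, 2, 2]  # hifigan/nsfhifigan settings above
assert np.prod(upsample_rates) == hop_size  # one frame -> hop_size samples

def quantize_f0(f0_hz, f0_min=50.0, f0_max=1100.0, n_bins=256):
    """Map F0 (Hz) to integer bins on a log scale (cf. use_log_f0, pitch_bin)."""
    f0 = np.clip(f0_hz, f0_min, f0_max)
    scaled = (np.log(f0) - np.log(f0_min)) / (np.log(f0_max) - np.log(f0_min))
    return np.minimum((scaled * n_bins).astype(np.int64), n_bins - 1)

print(quantize_f0(np.array([50.0, 440.0, 1100.0])))  # [  0 180 255]
```
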
2 changes: 1 addition & 1 deletion egs/svc/README.md
@@ -29,6 +29,6 @@ Until now, Amphion SVC has supported the following features and models:
- Transformer-based models:
- **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
- VAE- and Flow-based models:
- **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **Waveform Synthesizers (Vocoders)**:
- The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
125 changes: 125 additions & 0 deletions egs/svc/VitsSVC/README.md
@@ -0,0 +1,125 @@
# VITS for Singing Voice Conversion

This is an implementation of VITS as an acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), the SoftVC content encoder is used to extract content features from the source audio. These feature vectors are fed directly into VITS without conversion to a text-based intermediate representation.
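
As a mental model of this conditioning flow, here is a minimal, self-contained sketch (all module names and dimensions are illustrative, not the Amphion implementation): frame-level content features are projected and merged with F0 and speaker embeddings, and the result is what replaces the text encoding in a VITS-like model.

```python
import torch
import torch.nn as nn

class ToyContentConditioner(nn.Module):
    """Merge content features with melody (F0) and speaker conditions via
    addition, mirroring the "add" merge_mode in the default config."""
    def __init__(self, content_dim=256, cond_dim=192, n_singers=512, n_f0_bins=256):
        super().__init__()
        self.content_proj = nn.Linear(content_dim, cond_dim)
        self.f0_embed = nn.Embedding(n_f0_bins, cond_dim)
        self.spk_embed = nn.Embedding(n_singers, cond_dim)

    def forward(self, content, f0_bins, spk_id):
        # content: (B, T, content_dim); f0_bins: (B, T) int64; spk_id: (B,) int64
        h = self.content_proj(content) + self.f0_embed(f0_bins)
        return h + self.spk_embed(spk_id).unsqueeze(1)  # broadcast over time

B, T = 2, 100
cond = ToyContentConditioner()
h = cond(torch.randn(B, T, 256), torch.randint(0, 256, (B, T)), torch.tensor([3, 7]))
print(h.shape)  # torch.Size([2, 100, 192]) -> input to the VITS prior encoder
```
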

There are four stages in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```
## 1. Data Preparation
### Dataset Download
By default, we use five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
### Configuration
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
```json
"dataset": [
"m4singer",
"opencpop",
"opensinger",
"svcc",
"vctk"
],
"dataset_path": {
// TODO: Fill in your dataset path
"m4singer": "[M4Singer dataset path]",
"opencpop": "[Opencpop dataset path]",
"opensinger": "[OpenSinger dataset path]",
"svcc": "[SVCC dataset path]",
"vctk": "[VCTK dataset path]"
},
```
## 2. Feature Extraction

### Content-based Pretrained Models Download

By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
"log_dir": "ckpts/svc",
"preprocess": {
// TODO: Fill in the output data path. The default value is "Amphion/data"
"processed_dir": "data",
...
},
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/svc/VitsSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
## 3. Training

### Configuration

We provide default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24 GB GPU; you can adjust them based on your GPU machines.

```json
"train": {
"batch_size": 32,
...
"adamw": {
"lr": 2.0e-4
},
...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
## 4. Inference/Conversion

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experiment directory which contains `checkpoint`. | `[Your path to save logs and checkpoints]/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audio. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a JSON file or a directory). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` should be a folder that includes several audio files (*.wav, *.mp3, or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the Opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
--infer_source_audio_dir [Your Audios Folder] \
--infer_target_speaker "opencpop_female1" \
--infer_key_shift "autoshift"
```
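
As a side note on `--infer_key_shift`: transposition follows the equal-tempered scale, so shifting by `k` semitones scales F0 by 2^(k/12). This is standard music math, independent of Amphion:

```python
def shift_f0(f0_hz: float, semitones: float) -> float:
    """Transpose F0 by a number of equal-tempered semitones."""
    return f0_hz * 2 ** (semitones / 12)

print(round(shift_f0(220.0, 3), 1))   # 261.6 Hz (A3 up three semitones, ~C4)
print(round(shift_f0(220.0, -3), 1))  # 185.0 Hz
```
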