Merge pull request p0p4k#61 from choiHkk/features/training
add vctk training pipeline
p0p4k authored Oct 9, 2023
2 parents 4493317 + a71abaa commit 8aa26bc
Showing 13 changed files with 582 additions and 59 deletions.
62 changes: 62 additions & 0 deletions configs/vits2_vctk_standard.json
@@ -0,0 +1,62 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 1000,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 2e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 64,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 8192,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "use_mel_posterior_encoder": true,
+    "training_files":"filelists/vctk_audio_sid_text_train_filelist.txt.cleaned",
+    "validation_files":"filelists/vctk_audio_sid_text_val_filelist.txt.cleaned",
+    "text_cleaners":["english_cleaners3"],
+    "max_wav_value": 32768.0,
+    "sampling_rate": 22050,
+    "filter_length": 1024,
+    "hop_length": 256,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null,
+    "add_blank": false,
+    "n_speakers": 109,
+    "cleaned_text": true
+  },
+  "model": {
+    "use_mel_posterior_encoder": true,
+    "use_transformer_flows": true,
+    "transformer_flow_type": "pre_conv2",
+    "use_spk_conditioned_encoder": true,
+    "use_noise_scaled_mas": true,
+    "use_duration_discriminator": true,
+    "duration_discriminator_type": "dur_disc_2",
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [8,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "use_sdp": true,
+    "gin_channels": 256
+  }
+}
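
For orientation, this config is read with the repository's utils.get_hparams_from_file helper, the same call infer_onnx.py makes further down. A minimal sketch, assuming the file sits at the path in the diff header and that the returned HParams object exposes each JSON section as an attribute (as the hps.data.* access below suggests):

    import utils  # this repository's utils.py

    # Load the new VCTK config and read back a few of the fields above.
    hps = utils.get_hparams_from_file("configs/vits2_vctk_standard.json")
    print(hps.train.batch_size)    # 64
    print(hps.data.n_speakers)     # 109 (VCTK speakers)
    print(hps.model.gin_channels)  # 256 (speaker-embedding width)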
21 changes: 16 additions & 5 deletions data_utils.py
@@ -1,14 +1,16 @@
-import time
 import os
 import random
+import time
 
 import numpy as np
 import torch
 import torch.utils.data
 
 import commons
-from mel_processing import spectrogram_torch, mel_spectrogram_torch, spec_to_mel_torch
-from utils import load_wav_to_torch, load_filepaths_and_text
-from text import text_to_sequence, cleaned_text_to_sequence
+from mel_processing import (mel_spectrogram_torch, spec_to_mel_torch,
+                            spectrogram_torch)
+from text import cleaned_text_to_sequence, text_to_sequence
+from utils import load_filepaths_and_text, load_wav_to_torch
 
 
 class TextAudioLoader(torch.utils.data.Dataset):
@@ -233,6 +235,7 @@ def __init__(self, audiopaths_sid_text, hparams):
         self.add_blank = hparams.add_blank
         self.min_text_len = getattr(hparams, "min_text_len", 1)
         self.max_text_len = getattr(hparams, "max_text_len", 190)
+        self.min_audio_len = getattr(hparams, "min_audio_len", 8192)
 
         random.seed(1234)
         random.shuffle(self.audiopaths_sid_text)
@@ -249,11 +252,19 @@ def _filter(self):
         audiopaths_sid_text_new = []
         lengths = []
         for audiopath, sid, text in self.audiopaths_sid_text:
+            if not os.path.isfile(audiopath):
+                continue
             if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                 audiopaths_sid_text_new.append([audiopath, sid, text])
-                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+                length = os.path.getsize(audiopath) // (2 * self.hop_length)
+                if length < self.min_audio_len // self.hop_length:
+                    continue
+                lengths.append(length)
         self.audiopaths_sid_text = audiopaths_sid_text_new
         self.lengths = lengths
+        print(
+            len(self.lengths)
+        )  # if we use large corpus dataset, we can check how much time it takes.
 
     def get_audio_text_speaker_pair(self, audiopath_sid_text):
         # separate filename, speaker_id and text
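
Each filelist row is a path|speaker_id|text triple (pipe-separated in the stock VITS filelists), which is why the loop unpacks (audiopath, sid, text). The new length gate never decodes audio: it estimates the frame count from file size alone. A minimal sketch of that arithmetic, assuming 16-bit PCM WAVs and the hop_length/min_audio_len values used above (the ~44-byte WAV header inflates the estimate slightly, which this heuristic ignores):

    import os

    HOP_LENGTH = 256      # hps.data.hop_length in the VCTK config
    MIN_AUDIO_LEN = 8192  # default wired in via getattr in __init__ above

    def estimated_frames(audiopath: str) -> int:
        # 16-bit PCM stores 2 bytes per sample, so size // 2 approximates
        # the sample count; dividing by hop_length approximates spectrogram frames.
        return os.path.getsize(audiopath) // (2 * HOP_LENGTH)

    def long_enough(audiopath: str) -> bool:
        # Keep clips of at least 8192 // 256 = 32 frames, i.e. roughly
        # 8192 samples (about 0.37 s at the config's 22050 Hz).
        return estimated_frames(audiopath) >= MIN_AUDIO_LEN // HOP_LENGTH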
15 changes: 8 additions & 7 deletions infer_onnx.py
@@ -1,14 +1,14 @@
-import torch
-import onnxruntime
-import numpy as np
 import argparse
 
+import numpy as np
+import onnxruntime
+import torch
+from scipy.io.wavfile import write
+
 import commons
 import utils
 from text import text_to_sequence
 
-from scipy.io.wavfile import write
-
 
 def get_text(text, hps):
     text_norm = text_to_sequence(text, hps.data.text_cleaners)
Expand All @@ -28,18 +28,19 @@ def main() -> None:
"--output-wav-path", required=True, help="Path to write WAV file"
)
parser.add_argument("--text", required=True, type=str, help="Text to synthesize")
parser.add_argument("--sid", required=False, type=int, help="Speaker ID to synthesize")
args = parser.parse_args()

sess_options = onnxruntime.SessionOptions()
model = onnxruntime.InferenceSession(str(args.model), sess_options=sess_options)
model = onnxruntime.InferenceSession(str(args.model), sess_options=sess_options, providers=["CPUExecutionProvider"])

hps = utils.get_hparams_from_file(args.config_path)

phoneme_ids = get_text(args.text, hps)
text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
text_lengths = np.array([text.shape[1]], dtype=np.int64)
scales = np.array([0.667, 1.0, 0.8], dtype=np.float32)
sid = None
sid = np.array([int(args.sid)]) if args.sid is not None else None

audio = model.run(
None,
Expand Down
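
Two notes on these changes. Pinning providers=["CPUExecutionProvider"] satisfies recent onnxruntime releases, which expect an explicit provider list when constructing an InferenceSession; a hedged sketch of a more flexible choice (provider names are onnxruntime's standard identifiers, not part of this commit):

    import onnxruntime

    # Prefer CUDA when this onnxruntime build exposes it, else fall back to CPU.
    available = onnxruntime.get_available_providers()
    providers = (["CUDAExecutionProvider", "CPUExecutionProvider"]
                 if "CUDAExecutionProvider" in available
                 else ["CPUExecutionProvider"])
    session = onnxruntime.InferenceSession("model.onnx", providers=providers)

And with the new optional --sid flag, a multi-speaker invocation would look like this (flag names read off the parser calls above; the model filename, output path, and speaker ID are placeholders):

    python infer_onnx.py --model vits2_vctk.onnx --config-path configs/vits2_vctk_standard.json \
        --output-wav-path out.wav --text "Hello from VITS2." --sid 4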
[Diffs for the remaining 10 changed files did not load and are not shown.]
