diff --git a/README.md b/README.md
index 640021b..794a99f 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
 Singing Voice Conversion via diffusion model
 ## updates:
->2022.11.23 Fixed a major bug that could resample the original ground-truth audio used for inference to 22.05kHz. We sincerely apologize for the impact; please double-check your test audio and use the updated code\
+>2022.11.28 Added the no_fs2 option, enabled by default, which streamlines part of the network, speeds up training, and reduces model size; it takes effect for newly trained models\
+2022.11.23 Fixed a major bug that could resample the original ground-truth audio used for inference to 22.05kHz. We sincerely apologize for the impact; please double-check your test audio and use the updated code\
 2022.11.22 Fixed many bugs, including several that had a major impact on inference quality\
 2022.11.20 Added input and output support for most audio formats at inference time, with no manual conversion through other software needed\
 2022.11.13 Fixed the epoch/steps display when loading a model after an interruption, added an on-disk cache for f0 processing, and added support files for real-time voice-conversion inference\
diff --git a/infer.py b/infer.py
index 0d4f3b2..b5ceecc 100644
--- a/infer.py
+++ b/infer.py
@@ -42,7 +42,7 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
     f0_tst = []
     f0_pred = []
     audio = []
-    epsilon = 0.0002
+    epsilon = 0.00002
     for data in audio_data:
         print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
         length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
diff --git a/modules/fastspeech/fs2.py b/modules/fastspeech/fs2.py
index 1dce7bf..e718e00 100644
--- a/modules/fastspeech/fs2.py
+++ b/modules/fastspeech/fs2.py
@@ -23,12 +23,13 @@ def __init__(self, dictionary, out_dims=None):
         super().__init__()
         # self.dictionary = dictionary
         self.padding_idx = 0
-        self.enc_layers = hparams['enc_layers']
-        self.dec_layers = hparams['dec_layers']
+        if not hparams.get('no_fs2', False):  # build the FS2 encoder/decoder only when no_fs2 is off
+            self.enc_layers = hparams['enc_layers']
+            self.dec_layers = hparams['dec_layers']
+            self.encoder = FS_ENCODERS[hparams['encoder_type']](hparams)
+            self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
         self.hidden_size = hparams['hidden_size']
         # self.encoder_embed_tokens = self.build_embedding(self.dictionary, self.hidden_size)
-        self.encoder = FS_ENCODERS[hparams['encoder_type']](hparams)
-        self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
         self.out_dims = out_dims
         if out_dims is None:
             self.out_dims = hparams['audio_num_mel_bins']
@@ -94,7 +95,10 @@ def forward(self, hubert, mel2ph=None, spk_embed=None,
                 ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=True,
                 spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs):
         ret = {}
-        encoder_out =self.encoder(hubert)  # [B, T, C]
+        if not hparams.get('no_fs2', False):
+            encoder_out = self.encoder(hubert)  # [B, T, C]
+        else:
+            encoder_out = hubert  # no_fs2: pass the hubert features through unchanged
         src_nonpadding = (hubert!=0).any(-1)[:,:,None]

         # add ref style embed
@@ -142,10 +146,10 @@ def forward(self, hubert, mel2ph=None, spk_embed=None,
             decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret)

         ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding
-
-        if skip_decoder:
-            return ret
-        ret['mel_out'] = self.run_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)
+        if not hparams.get('no_fs2', False):  # with no_fs2 the FS2 mel decoder is dropped entirely
+            if skip_decoder:
+                return ret
+            ret['mel_out'] = self.run_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)

         return ret
diff --git a/training/config.yaml b/training/config.yaml
index 5030a2b..e8e3bfd 100644
--- a/training/config.yaml
+++ b/training/config.yaml
@@ -346,3 +346,4 @@ wav2spec_eps: 1e-6
 weight_decay: 0
 win_size: 512
 work_dir: checkpoints/atri
+no_fs2: true
diff --git a/training/config_nsf.yaml b/training/config_nsf.yaml
index 5fda5a6..c4a2e71 100644
--- a/training/config_nsf.yaml
+++ b/training/config_nsf.yaml
@@ -188,3 +188,4 @@ wav2spec_eps: 1e-6
 weight_decay: 0
 win_size: 2048
 work_dir: checkpoints/nyaru
+no_fs2: true
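
A note on the no_fs2 gate used in the fs2.py hunks above, as a minimal sketch assuming the project's global hparams behaves like a plain dict (build_fs2 is a hypothetical helper, not part of the repository): configs that do not define the key keep building the FastSpeech2 encoder/decoder, so checkpoints trained before this change still load, while new configs with no_fs2: true skip those modules and feed the hubert features straight through.

    # Minimal sketch of the no_fs2 gating; build_fs2 is a hypothetical helper.
    def build_fs2(hparams: dict) -> bool:
        """True when the FastSpeech2 encoder/decoder should be constructed."""
        return not hparams.get('no_fs2', False)

    assert build_fs2({}) is True                  # old config, key missing: keep FS2
    assert build_fs2({'no_fs2': False}) is True   # flag off: keep FS2
    assert build_fs2({'no_fs2': True}) is False   # flag on: skip FS2 (smaller, faster model)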