Skip to content

Commit

Permalink
Remove voice conversion limitation
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyTNT committed Aug 10, 2022
1 parent aff2fd6 commit 053dd0f
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 13 deletions.
7 changes: 3 additions & 4 deletions MoeGoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,15 @@ def get_speaker_id(message):


elif choice == 'v':
wav_path = input('Path of a WAV file (22050 Hz, 16 bits, 1 channel) to convert:\n')
audio_path = input('Path of an audio file to convert:\n')
print_speakers(hps_ms.speakers)
audio, sampling_rate = utils.load_wav_to_torch(wav_path)
audio = utils.load_audio_to_torch(audio_path, hps_ms.data.sampling_rate)

originnal_id = get_speaker_id('Original speaker ID: ')
target_id = get_speaker_id('Target speaker ID: ')
out_path = input('Path to save: ')

y = audio / hps_ms.data.max_wav_value
y = y.unsqueeze(0)
y = audio.unsqueeze(0)

spec = spectrogram_torch(y, hps_ms.data.filter_length,
hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
Expand Down
24 changes: 15 additions & 9 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import soundfile
import librosa
import torch

MATPLOTLIB_FLAG = False
Expand Down Expand Up @@ -53,7 +54,7 @@ def plot_spectrogram_to_numpy(spectrogram):
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np

fig, ax = plt.subplots(figsize=(10,2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
Expand Down Expand Up @@ -98,9 +99,14 @@ def plot_alignment_to_numpy(alignment, info=None):
return data


def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples together with its sampling rate.

    Args:
        full_path: Path of the WAV file, passed straight to
            ``scipy.io.wavfile.read``.

    Returns:
        A ``(samples, sampling_rate)`` tuple. ``samples`` is a
        ``torch.FloatTensor`` holding the file's sample values cast to
        float32 (no rescaling is applied here); ``sampling_rate`` is the
        rate reported by the file.
    """
    rate, samples = read(full_path)
    # Cast first so the tensor is built from a float32 array regardless of
    # the sample type stored in the file.
    as_float = samples.astype(np.float32)
    return torch.FloatTensor(as_float), rate
def load_audio_to_torch(full_path, target_sampling_rate):
    """Load an audio file as a mono float32 tensor at the requested rate.

    Args:
        full_path: Path of the audio file, passed to ``soundfile.read``.
        target_sampling_rate: Sampling rate the returned audio should have.

    Returns:
        A 1-D ``torch.FloatTensor`` with the audio samples. Multi-channel
        input is mixed down to mono, and the signal is resampled when the
        file's own rate differs from ``target_sampling_rate``.
    """
    audio, sampling_rate = soundfile.read(full_path, dtype="float32")
    if audio.ndim > 1:
        # soundfile returns (frames, channels); librosa.to_mono expects the
        # channel axis first, hence the transpose.
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != target_sampling_rate:
        audio = librosa.resample(
            audio, orig_sr=sampling_rate, target_sr=target_sampling_rate
        )
    # Re-cast so the tensor is float32 whatever dtype the steps above produced.
    return torch.FloatTensor(audio.astype(np.float32))


def load_filepaths_and_text(filename, split="|"):
Expand All @@ -115,7 +121,7 @@ def get_hparams(init=True):
help='JSON file for configuration')
parser.add_argument('-m', '--model', type=str, required=True,
help='Model name')

args = parser.parse_args()
model_dir = os.path.join("./logs", args.model)

Expand All @@ -133,7 +139,7 @@ def get_hparams(init=True):
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)

hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
Expand Down Expand Up @@ -183,7 +189,7 @@ def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
Expand All @@ -200,7 +206,7 @@ def __init__(self, **kwargs):
if type(v) == dict:
v = HParams(**v)
self[k] = v

def keys(self):
    """Return a view of the attribute names stored on this instance."""
    return vars(self).keys()

Expand Down

0 comments on commit 053dd0f

Please sign in to comment.