Remove voice conversion limitation

jiyidesueipian · Aug 10, 2022 · 053dd0f · 053dd0f
1 parent aff2fd6
commit 053dd0f
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 13 deletions.
diff --git a/MoeGoe.py b/MoeGoe.py
@@ -89,16 +89,15 @@ def get_speaker_id(message):
 
 
         elif choice == 'v':
-            wav_path = input('Path of a WAV file (22050 Hz, 16 bits, 1 channel) to convert:\n')
+            audio_path = input('Path of an audio file to convert:\n')
             print_speakers(hps_ms.speakers)
-            audio, sampling_rate = utils.load_wav_to_torch(wav_path)
+            audio = utils.load_audio_to_torch(audio_path, hps_ms.data.sampling_rate)
 
             originnal_id = get_speaker_id('Original speaker ID: ')
             target_id = get_speaker_id('Target speaker ID: ')
             out_path = input('Path to save: ')
 
-            y = audio / hps_ms.data.max_wav_value
-            y = y.unsqueeze(0)
+            y = audio.unsqueeze(0)
 
             spec = spectrogram_torch(y, hps_ms.data.filter_length,
                 hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,

diff --git a/utils.py b/utils.py
@@ -6,7 +6,8 @@
 import json
 import subprocess
 import numpy as np
-from scipy.io.wavfile import read
+import soundfile
+import librosa
 import torch
 
 MATPLOTLIB_FLAG = False
@@ -53,7 +54,7 @@ def plot_spectrogram_to_numpy(spectrogram):
     mpl_logger.setLevel(logging.WARNING)
   import matplotlib.pylab as plt
   import numpy as np
-  
+
   fig, ax = plt.subplots(figsize=(10,2))
   im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
@@ -98,9 +99,14 @@ def plot_alignment_to_numpy(alignment, info=None):
   return data
 
 
-def load_wav_to_torch(full_path):
-  sampling_rate, data = read(full_path)
-  return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+def load_audio_to_torch(full_path, target_sampling_rate):
+  audio, sampling_rate = soundfile.read(full_path, dtype=np.float32)
+  print(audio.shape)
+  if len(audio.shape) > 1:
+    audio = librosa.to_mono(audio.transpose(1, 0))
+  if sampling_rate != target_sampling_rate:
+    audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=target_sampling_rate)
+  return torch.FloatTensor(audio.astype(np.float32))
 
 
 def load_filepaths_and_text(filename, split="|"):
@@ -115,7 +121,7 @@ def get_hparams(init=True):
                       help='JSON file for configuration')
   parser.add_argument('-m', '--model', type=str, required=True,
                       help='Model name')
-  
+
   args = parser.parse_args()
   model_dir = os.path.join("./logs", args.model)
 
@@ -133,7 +139,7 @@ def get_hparams(init=True):
     with open(config_save_path, "r") as f:
       data = f.read()
   config = json.loads(data)
-  
+
   hparams = HParams(**config)
   hparams.model_dir = model_dir
   return hparams
@@ -183,7 +189,7 @@ def get_logger(model_dir, filename="train.log"):
   global logger
   logger = logging.getLogger(os.path.basename(model_dir))
   logger.setLevel(logging.DEBUG)
-  
+
   formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
   if not os.path.exists(model_dir):
     os.makedirs(model_dir)
@@ -200,7 +206,7 @@ def __init__(self, **kwargs):
       if type(v) == dict:
         v = HParams(**v)
       self[k] = v
-    
+
   def keys(self):
     return self.__dict__.keys()