Skip to content

Commit

Permalink
Enforce phonemizer definition for synthesis (coqui-ai#1441)
Browse files Browse the repository at this point in the history
* Enforce phonemizer definition for synthesis

* Fix train_tts, tokenizer init can now edit config

* Add small change to trigger CI pipeline

* fix wrong output path for one tts_test

* Fix style

* Test config overrides by args and tokenizer

* Fix style
  • Loading branch information
WeberJulian authored Mar 25, 2022
1 parent 37896e1 commit c66a624
Show file tree
Hide file tree
Showing 19 changed files with 133 additions and 58 deletions.
2 changes: 1 addition & 1 deletion TTS/bin/train_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def main():
# init the trainer and 🚀
trainer = Trainer(
train_args,
config,
model.config,
config.output_path,
model=model,
train_samples=train_samples,
Expand Down
1 change: 1 addition & 0 deletions TTS/tts/utils/text/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
phonemizer = get_phonemizer_by_name(
DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
)
new_config.phonemizer = phonemizer.name()
except KeyError as e:
raise ValueError(
f"""No phonemizer found for language {config.phoneme_language}.
Expand Down
3 changes: 3 additions & 0 deletions TTS/utils/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
self.use_phonemes = self.tts_config.use_phonemes
self.tts_model = setup_tts_model(config=self.tts_config)

if self.use_phonemes and self.tts_config["phonemizer"] is None:
raise ValueError("Phonemizer is not defined in the TTS config.")

if not self.encoder_checkpoint:
self._set_speaker_encoder_paths_from_tts_config()

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ tensorboardX
pyworld
# coqui stack
coqui-trainer
coqpit # config managemenr
coqpit # config management
# chinese g2p deps
jieba
pypinyin
Expand Down
11 changes: 10 additions & 1 deletion tests/tts_tests/test_align_tts_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -42,7 +43,7 @@
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
"--coqpit.test_delay_epochs -1"
"--coqpit.test_delay_epochs 0 "
)
run_cli(command_train)

Expand All @@ -54,6 +55,14 @@
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_fast_pitch_speaker_emb_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -74,6 +75,14 @@
speaker_id = "ljspeech-1"
continue_speakers_path = os.path.join(continue_path, "speakers.json")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_fast_pitch_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -73,6 +74,14 @@
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_glow_tts_d-vectors_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -61,6 +62,14 @@
speaker_id = "ljspeech-1"
continue_speakers_path = config.d_vector_file

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_glow_tts_speaker_emb_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -58,6 +59,14 @@
speaker_id = "ljspeech-1"
continue_speakers_path = os.path.join(continue_path, "speakers.json")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_glow_tts_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -55,6 +56,14 @@
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_speedy_speech_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -54,6 +55,14 @@
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_tacotron2_d-vectors_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -61,6 +62,14 @@
speaker_id = "ljspeech-1"
continue_speakers_path = config.d_vector_file

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_tacotron2_speaker_emb_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -59,6 +60,14 @@
speaker_id = "ljspeech-1"
continue_speakers_path = os.path.join(continue_path, "speakers.json")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_tacotron2_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -54,6 +55,14 @@
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
55 changes: 0 additions & 55 deletions tests/tts_tests/test_tacotron2_train_fsspec_path.py

This file was deleted.

9 changes: 9 additions & 0 deletions tests/tts_tests/test_vits_multilingual_speaker_emb_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -92,6 +93,14 @@
continue_speakers_path = os.path.join(continue_path, "speakers.json")
continue_languages_path = os.path.join(continue_path, "language_ids.json")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_vits_multilingual_train-d_vectors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -99,6 +100,14 @@
continue_speakers_path = config.d_vector_file
continue_languages_path = os.path.join(continue_path, "language_ids.json")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_vits_speaker_emb_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -65,6 +66,14 @@
speaker_id = "ljspeech-1"
continue_speakers_path = os.path.join(continue_path, "speakers.json")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down
9 changes: 9 additions & 0 deletions tests/tts_tests/test_vits_train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import json
import os
import shutil

Expand Down Expand Up @@ -54,6 +55,14 @@
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")

# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

Expand Down

0 comments on commit c66a624

Please sign in to comment.