diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb new file mode 100644 index 0000000000..41713295f2 --- /dev/null +++ b/notebooks/VITS_d-vector_multilingual_exemple.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "import IPython\n", + "import torch\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "from TTS.config import load_config\n", + "from TTS.tts.models import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.utils.audio import AudioProcessor" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n", + "MODEL_PATH = GENERAL_PATH + 'best_model.pth.tar'\n", + "CONFIG_PATH = GENERAL_PATH + 'config.json'\n", + "TTS_LANGUAGES = GENERAL_PATH + \"language_ids.json\"\n", + "TTS_SPEAKERS = GENERAL_PATH + \"speakers.json\"\n", + "USE_CUDA = torch.cuda.is_available()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "speaker_embedding = None\n", + "\n", + "C.model_args['d_vector_file'] = TTS_SPEAKERS\n", + "\n", + "model = setup_model(C)\n", + "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "\n", + "\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model = model.cuda()\n", + "\n", + "use_griffin_lim = True" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:16000\n", + " | > resample:False\n", + " | > 
num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:20\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:False\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:0\n", + " | > mel_fmax:None\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:45\n", + " | > do_sound_norm:False\n", + " | > do_amp_to_db_linear:False\n", + " | > do_amp_to_db_mel:True\n", + " | > stats_path:None\n", + " | > base:2.718281828459045\n", + " | > hop_length:256\n", + " | > win_length:1024\n", + " > Using model: vits\n", + " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, 
MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, 
openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, 
openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# set speaker\n", + "d_vector = model.speaker_manager.get_mean_d_vector('VCTK_p260')" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "model.language_manager.language_id_mapping" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'af': 0,\n", + " 'en': 1,\n", + " 'fr-fr': 2,\n", + " 'jv': 3,\n", + " 'pt-br': 4,\n", + " 'st': 5,\n", + " 'su': 6,\n", + " 'tn': 7,\n", + " 'xh': 8}" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": { + "scrolled": true + } + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# set scales\n", + "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n", + "model.length_scale = 1.0 # scaling factor for the duration predictor. The larger it is, the slower the speech.\n", + "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n", + "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n", + "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference." 
+ ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n", + "language_id = 2\n", + "wav, alignment, _, _ = synthesis(\n", + " model,\n", + " text,\n", + " C,\n", + " \"cuda\" in str(next(model.parameters()).device),\n", + " ap,\n", + " speaker_id=None,\n", + " d_vector=d_vector,\n", + " style_wav=None,\n", + " language_id=language_id,\n", + " enable_eos_bos_chars=C.enable_eos_bos_chars,\n", + " use_griffin_lim=True,\n", + " do_trim_silence=False,\n", + " ).values()\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {} + } + ], + "metadata": {} + } + ], + "metadata": { + "interpreter": { + "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.10 64-bit ('TTS': conda)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file