diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb
new file mode 100644
index 0000000000..41713295f2
--- /dev/null
+++ b/notebooks/VITS_d-vector_multilingual_exemple.ipynb
@@ -0,0 +1,223 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "source": [
+ "import IPython\n",
+ "import torch\n",
+ "\n",
+ "from IPython.display import Audio\n",
+ "\n",
+ "from TTS.config import load_config\n",
+ "from TTS.tts.models import setup_model\n",
+ "from TTS.tts.utils.synthesis import synthesis\n",
+ "from TTS.utils.audio import AudioProcessor"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "source": [
+ "# Directory of the training run. The file names below are appended to it verbatim,\n",
+ "# so the value has to keep its trailing slash. Adjust it to your own checkpoint folder.\n",
+ "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n",
+ "\n",
+ "# Artifacts read from that directory.\n",
+ "CONFIG_PATH = f'{GENERAL_PATH}config.json'\n",
+ "MODEL_PATH = f'{GENERAL_PATH}best_model.pth.tar'\n",
+ "TTS_SPEAKERS = f'{GENERAL_PATH}speakers.json'\n",
+ "TTS_LANGUAGES = f'{GENERAL_PATH}language_ids.json'\n",
+ "\n",
+ "# True only when torch can see a CUDA device.\n",
+ "USE_CUDA = torch.cuda.is_available()"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "source": [
+ "# Load the training configuration.\n",
+ "C = load_config(CONFIG_PATH)\n",
+ "\n",
+ "# Build the audio processor from the audio section of that config.\n",
+ "ap = AudioProcessor(**C.audio)\n",
+ "\n",
+ "# Point the config at the local d-vector file before the model is built.\n",
+ "C.model_args['d_vector_file'] = TTS_SPEAKERS\n",
+ "\n",
+ "# Build the model, then register the language ids from the run directory.\n",
+ "model = setup_model(C)\n",
+ "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n",
+ "\n",
+ "# Map the checkpoint onto the CPU first so loading also works on machines without a GPU.\n",
+ "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
+ "model.load_state_dict(cp['model'])\n",
+ "\n",
+ "# The notebook only runs inference, so put the model in eval mode.\n",
+ "model.eval()\n",
+ "\n",
+ "if USE_CUDA:\n",
+ "    model = model.cuda()\n",
+ "\n",
+ "# Flag matching the `use_griffin_lim` argument of synthesis().\n",
+ "use_griffin_lim = True"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:16000\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:False\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:0\n",
+ " | > mel_fmax:None\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:45\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:False\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > stats_path:None\n",
+ " | > base:2.718281828459045\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Using model: vits\n",
+ " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, 
VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, 
openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n"
+ ]
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "source": [
+ "# Choose the speaker and fetch its d-vector through get_mean_d_vector();\n",
+ "# the result is handed to synthesis() as `d_vector` further down.\n",
+ "SPEAKER_NAME = 'VCTK_p260'\n",
+ "d_vector = model.speaker_manager.get_mean_d_vector(SPEAKER_NAME)"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "source": [
+ "# Display the language-code -> language-id mapping held by the language manager.\n",
+ "model.language_manager.language_id_mapping"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'af': 0,\n",
+ " 'en': 1,\n",
+ " 'fr-fr': 2,\n",
+ " 'jv': 3,\n",
+ " 'pt-br': 4,\n",
+ " 'st': 5,\n",
+ " 'su': 6,\n",
+ " 'tn': 7,\n",
+ " 'xh': 8}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ],
+ "metadata": {
+ "scrolled": true
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "source": [
+ "# Set the inference-time scales as attributes on the model.\n",
+ "# NOTE(review): `noise_scale` / `noise_scale_w` carry the same descriptions as\n",
+ "# `inference_noise_scale` / `inference_noise_scale_dp` below but are given different\n",
+ "# values -- confirm which of these attributes the model actually reads at inference.\n",
+ "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n",
+ "model.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.\n",
+ "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n",
+ "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n",
+ "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference."
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "source": [
+ "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n",
+ "\n",
+ "# Resolve the language id by its code instead of hard-coding the integer, so the cell\n",
+ "# keeps working when language_ids.json assigns the ids in a different order.\n",
+ "language_id = model.language_manager.language_id_mapping['fr-fr']\n",
+ "\n",
+ "# synthesis() returns a mapping with four values; only the first two (waveform and\n",
+ "# alignment) are used here.\n",
+ "wav, alignment, _, _ = synthesis(\n",
+ "    model,\n",
+ "    text,\n",
+ "    C,\n",
+ "    \"cuda\" in str(next(model.parameters()).device),  # True when the model sits on a CUDA device\n",
+ "    ap,\n",
+ "    speaker_id=None,\n",
+ "    d_vector=d_vector,\n",
+ "    style_wav=None,\n",
+ "    language_id=language_id,\n",
+ "    enable_eos_bos_chars=C.enable_eos_bos_chars,\n",
+ "    use_griffin_lim=use_griffin_lim,  # flag defined in the model-loading cell\n",
+ "    do_trim_silence=False,\n",
+ ").values()\n",
+ "\n",
+ "# Render an audio player for the generated waveform.\n",
+ "IPython.display.display(Audio(wav, rate=ap.sample_rate))"
+ ],
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ],
+ "metadata": {}
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3.8.10 64-bit ('TTS': conda)"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file