Skip to content

Commit

Permalink
Update CheckSpectrograms notebook (coqui-ai#1418)
Browse files Browse the repository at this point in the history
  • Loading branch information
erogol authored Mar 18, 2022
1 parent c7f9ec0 commit 2e6e8f6
Showing 1 changed file with 124 additions and 94 deletions.
218 changes: 124 additions & 94 deletions notebooks/dataset_analysis/CheckSpectrograms.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
Expand All @@ -12,21 +16,51 @@
"\n",
"import IPython.display as ipd\n",
"import glob"
],
"outputs": [],
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
}
},
"outputs": [],
"source": [
"from TTS.config.shared_configs import BaseAudioConfig\n",
"CONFIG = BaseAudioConfig()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ✍️ Set these values "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n",
"data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n",
"\n",
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
"CONFIG = load_config(config_path)\n",
"data_path = \"/root/wav48_silence_trimmed/\"\n",
"file_ext = \".flac\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read audio files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n",
"\n",
"# Change this to the index of the desired file listed below\n",
"sample_file_index = 10\n",
Expand All @@ -35,44 +69,45 @@
"\n",
"print(\"File list, by index:\")\n",
"dict(enumerate(file_paths))"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Setup Audio Processor\n",
"## ✍️ Set Audio Processor\n",
"Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
"\n",
"The default values are loaded from your config.json file, so you only need to\n",
"uncomment and modify values below that you'd like to tune."
],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"tune_params={\n",
"# 'audio_processor': 'audio',\n",
"# 'num_mels': 80, # In general, you don't need to change this. \n",
"# 'fft_size': 1024, # In general, you don't need to change this.\n",
"# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change this.\n",
"# 'win_length': 1024, # In general, you don't need to change this.\n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
"# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
"# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
" 'num_mels': 80, # In general, you don't need to change this. \n",
" 'fft_size': 2400, # In general, you don't need to change this.\n",
" 'frame_length_ms': 50, \n",
" 'frame_shift_ms': 12.5,\n",
" 'sample_rate': 48000, # This must match the sample rate of the dataset.\n",
" 'hop_length': None, # In general, you don't need to change this.\n",
" 'win_length': 1024, # In general, you don't need to change this.\n",
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
" 'min_level_db': -100,\n",
" 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
" 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
" 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
" 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
" 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
"}\n",
"\n",
"# These options have to be forced off in order to avoid errors about the \n",
Expand All @@ -86,59 +121,57 @@
"}\n",
"\n",
"# Override select parts of loaded config with parameters above\n",
"tuned_config = CONFIG.audio.copy()\n",
"tuned_config = CONFIG.copy()\n",
"tuned_config.update(reset)\n",
"tuned_config.update(tune_params)\n",
"\n",
"AP = AudioProcessor(**tuned_config);"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Check audio loading "
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Check audio loading "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
"ipd.Audio(data=wav, rate=AP.sample_rate) "
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"AP.power = 1.5"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mel = AP.melspectrogram(wav)\n",
"print(\"Max:\", mel.max())\n",
Expand All @@ -148,24 +181,24 @@
"\n",
"wav_gen = AP.inv_melspectrogram(mel)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"spec = AP.spectrogram(wav)\n",
"print(\"Max:\", spec.max())\n",
Expand All @@ -175,26 +208,26 @@
"\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Compare values for a certain parameter\n",
"\n",
"Optimize your parameters by comparing different values per parameter at a time."
],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"from librosa import display\n",
"from matplotlib import pylab as plt\n",
Expand Down Expand Up @@ -234,39 +267,39 @@
" val = values[idx]\n",
" print(\" > {} = {}\".format(attribute, val))\n",
" IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
},
"outputs": [],
"source": [
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
},
"outputs": [],
"source": [
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
]
}
],
"metadata": {
"interpreter": {
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.5 64-bit ('torch': conda)"
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -278,12 +311,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"interpreter": {
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
"version": "3.9.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}

0 comments on commit 2e6e8f6

Please sign in to comment.