Skip to content

Commit

Permalink
allow using opus archive and not resampling
Browse files (browse the repository at this point in the history)
Signed-off-by: Elena Rastorgueva <[email protected]>
  • Loading branch information
erastorgueva-nv committed Jul 17, 2023
1 parent 06d9e91 commit 00c6507
Showing 1 changed file with 66 additions and 22 deletions.
88 changes: 66 additions & 22 deletions sdp/processors/datasets/mls/create_initial_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@

import os
from pathlib import Path
from typing import Optional

import sox
import librosa
from sox import Transformer

from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import download_file, extract_archive

MLS_URL = "https://dl.fbaipublicfiles.com/mls/mls_{language}.tar.gz"
MLS_URL_NO_OPUS = "https://dl.fbaipublicfiles.com/mls/mls_{language}.tar.gz"
MLS_URL_OPUS = "https://dl.fbaipublicfiles.com/mls/mls_{language}_opus.tar.gz"


class CreateInitialManifestMLS(BaseParallelProcessor):
Expand All @@ -39,12 +42,20 @@ class CreateInitialManifestMLS(BaseParallelProcessor):
This will be used to format the URL from which we attempt to download the data.
E.g., "english", "italian", "spanish", etc.
data_split (str): "train", "dev" or "test".
resampled_audio_dir (str): the directory where the resampled
wav files will be stored.
target_samplerate (int): sample rate (Hz) to use for resampling.
resampled_audio_dir (str or None): if specified, the directory where the resampled
wav files will be stored. If not specified, the audio will not be resampled and
the parameters `target_samplerate` and `target_nchannels` will be ignored.
target_samplerate (int): sample rate (Hz) to use for resampling. This parameter will
be ignored if `resampled_audio_dir` is `None`.
Defaults to 16000.
target_nchannels (int): number of channels to create during resampling process.
target_nchannels (int): number of channels to create during resampling process. This
parameter will be ignored if `resampled_audio_dir` is `None`.
Defaults to 1.
use_opus_archive (bool): if `True`, will use the version of the archive file which
contains audio files saved in the OPUS format instead of FLAC. If this parameter
is `True`, the parameter `resampled_audio_dir` must be `None`, as resampling OPUS
audio files is currently not supported (a `ValueError` is raised otherwise).
Defaults to `False`.
Returns:
This processor generates an initial manifest file with the following fields::
Expand All @@ -61,18 +72,36 @@ def __init__(
raw_data_dir: str,
language: str,
data_split: str,
resampled_audio_dir: str,
resampled_audio_dir: Optional[str],
target_samplerate: int = 16000,
target_nchannels: int = 1,
use_opus_archive: bool = False,
**kwargs,
):
super().__init__(**kwargs)
self.raw_data_dir = Path(raw_data_dir)
self.language = language
self.data_split = data_split
self.resampled_audio_dir = Path(resampled_audio_dir)
self.resampled_audio_dir = Path(resampled_audio_dir) if resampled_audio_dir else None
self.target_samplerate = target_samplerate
self.target_nchannels = target_nchannels
self.use_opus_archive = use_opus_archive

# validate params
if self.use_opus_archive and self.resampled_audio_dir:
raise ValueError(
f"`use_opus_archive` is True and `resampled_audio_dir` is not None, but we currently do not"
" support resampling OPUS-format audio, please either set `use_opus_archive` to False or"
" resampled_audio_dir to None."
)

if not resampled_audio_dir:
logger.info(
"`resampled_audio_dir` is None => will not attempt to resample audio. Please note if you have"
" specified `target_samplerate` or `target_nchannels`, they will be ignored."
)



# will be initialized in self.prepare method
self.audio_path_prefix = None
Expand All @@ -82,10 +111,15 @@ def prepare(self):
"""Download and extract the data (unless already done)."""
os.makedirs(self.raw_data_dir, exist_ok=True)

url = MLS_URL.format(language=self.language)
if self.use_opus_archive:
url = MLS_URL_OPUS.format(language=self.language)
if not (self.raw_data_dir / f"mls_{self.language}_opus.tar.gz").exists():
download_file(url, str(self.raw_data_dir))

if not (self.raw_data_dir / f"mls_{self.language}.tar.gz").exists():
download_file(url, str(self.raw_data_dir))
else:
url = MLS_URL_NO_OPUS.format(language=self.language)
if not (self.raw_data_dir / f"mls_{self.language}.tar.gz").exists():
download_file(url, str(self.raw_data_dir))

data_folder = extract_archive(str(self.raw_data_dir / os.path.basename(url)), str(self.raw_data_dir))

Expand Down Expand Up @@ -114,20 +148,30 @@ def process_dataset_entry(self, data_entry: str):
utt_id, text = data_entry.split("\t")
transcript_text = text.strip()

src_flac_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".flac")
tgt_wav_path = os.path.join(self.resampled_audio_dir, *utt_id.split("_")[:2], utt_id + ".wav")
# specify src_audio_path
if self.use_opus_archive:
src_audio_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".opus")
else:
src_audio_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".flac")

# specify tgt_audio_path
if self.resampled_audio_dir:
tgt_audio_path = os.path.join(self.resampled_audio_dir, *utt_id.split("_")[:2], utt_id + ".wav")

if not os.path.exists(os.path.dirname(tgt_audio_path)):
os.makedirs(os.path.dirname(tgt_audio_path), exist_ok=True)
if not os.path.exists(tgt_audio_path):
tfm = Transformer()
tfm.rate(samplerate=self.target_samplerate)
tfm.channels(n_channels=self.target_nchannels)
tfm.build(input_filepath=src_audio_path, output_filepath=tgt_audio_path)

if not os.path.exists(os.path.dirname(tgt_wav_path)):
os.makedirs(os.path.dirname(tgt_wav_path), exist_ok=True)
if not os.path.exists(tgt_wav_path):
tfm = Transformer()
tfm.rate(samplerate=self.target_samplerate)
tfm.channels(n_channels=self.target_nchannels)
tfm.build(input_filepath=src_flac_path, output_filepath=tgt_wav_path)
else:
tgt_audio_path = src_audio_path

data = {
"audio_filepath": tgt_wav_path,
"duration": float(sox.file_info.duration(tgt_wav_path)),
"audio_filepath": tgt_audio_path,
"duration": float(librosa.get_duration(path=tgt_audio_path)),
"text": transcript_text,
}

Expand Down

0 comments on commit 00c6507

Please sign in to comment.