Allow using the OPUS archive and skipping audio resampling

Signed-off-by: Elena Rastorgueva <[email protected]>
pzelasko · Jul 17, 2023 · 00c6507 · 00c6507
1 parent 06d9e91
commit 00c6507
Showing 1 changed file with 66 additions and 22 deletions.
diff --git a/sdp/processors/datasets/mls/create_initial_manifest.py b/sdp/processors/datasets/mls/create_initial_manifest.py
@@ -14,14 +14,17 @@
 
 import os
 from pathlib import Path
+from typing import Optional
 
-import sox
+import librosa
 from sox import Transformer
 
+from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
 from sdp.utils.common import download_file, extract_archive
 
-MLS_URL = "https://dl.fbaipublicfiles.com/mls/mls_{language}.tar.gz"
+MLS_URL_NO_OPUS = "https://dl.fbaipublicfiles.com/mls/mls_{language}.tar.gz"
+MLS_URL_OPUS = "https://dl.fbaipublicfiles.com/mls/mls_{language}_opus.tar.gz"
 
 
 class CreateInitialManifestMLS(BaseParallelProcessor):
@@ -39,12 +42,20 @@ class CreateInitialManifestMLS(BaseParallelProcessor):
             This will be used to format the URL from which we attempt to download the data.
             E.g., "english", "italian", "spanish", etc.
         data_split (str): "train", "dev" or "test".
-        resampled_audio_dir (str): the directory where the resampled
-            wav files will be stored.
-        target_samplerate (int): sample rate (Hz) to use for resampling.
+        resampled_audio_dir (str or None): if specified, the directory where the resampled
+            wav files will be stored. If not specified, the audio will not be resampled and
+            the parameters `target_samplerate` and `target_nchannels` will be ignored.
+        target_samplerate (int): sample rate (Hz) to use for resampling. This parameter will
+            be ignored if `resampled_audio_dir` is `None`.
             Defaults to 16000.
-        target_nchannels (int): number of channels to create during resampling process.
+        target_nchannels (int): number of channels to create during resampling process. This
+            parameter will be ignored if `resampled_audio_dir` is `None`.
             Defaults to 1.
+        use_opus_archive (bool): if `True`, will use the version of the archive file which
+            contains audio files saved in the opus format, instead of flac. If this parameter
+            is `True`, the parameter `resampled_audio_dir` must be `None`, as resampling OPUS
+            audio files is currently not supported.
+            Defaults to `False`.
 
     Returns:
         This processor generates an initial manifest file with the following fields::
@@ -61,18 +72,36 @@ def __init__(
         raw_data_dir: str,
         language: str,
         data_split: str,
-        resampled_audio_dir: str,
+        resampled_audio_dir: Optional[str],
         target_samplerate: int = 16000,
         target_nchannels: int = 1,
+        use_opus_archive: bool = False,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.raw_data_dir = Path(raw_data_dir)
         self.language = language
         self.data_split = data_split
-        self.resampled_audio_dir = Path(resampled_audio_dir)
+        self.resampled_audio_dir = Path(resampled_audio_dir) if resampled_audio_dir else None
         self.target_samplerate = target_samplerate
         self.target_nchannels = target_nchannels
+        self.use_opus_archive = use_opus_archive
+
+        # validate params
+        if self.use_opus_archive and self.resampled_audio_dir:
+            raise ValueError(
+                f"`use_opus_archive` is True and `resampled_audio_dir` is not None, but we currently do not"
+                " support resampling OPUS-format audio, please either set `use_opus_archive` to False or"
+                " resampled_audio_dir to None."
+            )
+
+        if not resampled_audio_dir:
+            logger.info(
+                "`resampled_audio_dir` is None => will not attempt to resample audio. Please note if you have"
+                " specified `target_samplerate` or `target_nchannels`, they will be ignored."
+            )
+
+
 
         # will be initialized in self.prepare method
         self.audio_path_prefix = None
@@ -82,10 +111,15 @@ def prepare(self):
         """Downloading and extracting data (unless already done)."""
         os.makedirs(self.raw_data_dir, exist_ok=True)
 
-        url = MLS_URL.format(language=self.language)
+        if self.use_opus_archive:
+            url = MLS_URL_OPUS.format(language=self.language)
+            if not (self.raw_data_dir / f"mls_{self.language}_opus.tar.gz").exists():
+                download_file(url, str(self.raw_data_dir))
 
-        if not (self.raw_data_dir / f"mls_{self.language}.tar.gz").exists():
-            download_file(url, str(self.raw_data_dir))
+        else:
+            url = MLS_URL_NO_OPUS.format(language=self.language)
+            if not (self.raw_data_dir / f"mls_{self.language}.tar.gz").exists():
+                download_file(url, str(self.raw_data_dir))
 
         data_folder = extract_archive(str(self.raw_data_dir / os.path.basename(url)), str(self.raw_data_dir))
 
@@ -114,20 +148,30 @@ def process_dataset_entry(self, data_entry: str):
         utt_id, text = data_entry.split("\t")
         transcript_text = text.strip()
 
-        src_flac_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".flac")
-        tgt_wav_path = os.path.join(self.resampled_audio_dir, *utt_id.split("_")[:2], utt_id + ".wav")
+        # specify src_audio_path
+        if self.use_opus_archive:
+            src_audio_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".opus")
+        else:
+            src_audio_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".flac")
+
+        # specify tgt_audio_path
+        if self.resampled_audio_dir:
+            tgt_audio_path = os.path.join(self.resampled_audio_dir, *utt_id.split("_")[:2], utt_id + ".wav")
+
+            if not os.path.exists(os.path.dirname(tgt_audio_path)):
+                os.makedirs(os.path.dirname(tgt_audio_path), exist_ok=True)
+            if not os.path.exists(tgt_audio_path):
+                tfm = Transformer()
+                tfm.rate(samplerate=self.target_samplerate)
+                tfm.channels(n_channels=self.target_nchannels)
+                tfm.build(input_filepath=src_audio_path, output_filepath=tgt_audio_path)
 
-        if not os.path.exists(os.path.dirname(tgt_wav_path)):
-            os.makedirs(os.path.dirname(tgt_wav_path), exist_ok=True)
-        if not os.path.exists(tgt_wav_path):
-            tfm = Transformer()
-            tfm.rate(samplerate=self.target_samplerate)
-            tfm.channels(n_channels=self.target_nchannels)
-            tfm.build(input_filepath=src_flac_path, output_filepath=tgt_wav_path)
+        else:
+            tgt_audio_path = src_audio_path
 
         data = {
-            "audio_filepath": tgt_wav_path,
-            "duration": float(sox.file_info.duration(tgt_wav_path)),
+            "audio_filepath": tgt_audio_path,
+            "duration": float(librosa.get_duration(path=tgt_audio_path)),
             "text": transcript_text,
         }