Add amplification and soundfile optional (#125)

Co-authored-by: Christiaan van Luik <[email protected]>
nomadkaraoke · Oct 8, 2024 · f273328 · f273328
1 parent 4ca6afe
commit f273328
Show file tree

Hide file tree

Showing 9 changed files with 49 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -310,9 +310,11 @@ output_file_paths_6 = separator.separate('audio3.wav')
 - output_dir: (Optional) Directory where the separated files will be saved. If not specified, uses the current directory.
 - output_format: (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). Default: WAV
 - normalization_threshold: (Optional) The maximum peak amplitude for the output audio. If the peak amplitude of the audio is above this threshold, the waveform will be scaled down to meet it. Default: 0.9
+- amplification_threshold: (Optional) The minimum amplitude level at which the waveform will be amplified. If the peak amplitude of the audio is below this threshold, the waveform will be scaled up to meet it. Default: 0.6
 - output_single_stem: (Optional) Output only a single stem, such as 'Instrumental' and 'Vocals'. Default: None
 - invert_using_spec: (Optional) Flag to invert using spectrogram. Default: False
 - sample_rate: (Optional) Set the sample rate of the output audio. Default: 44100
+- use_soundfile: (Optional) Use soundfile for output writing, which can solve OOM issues, especially on longer audio. Default: False
 - mdx_params: (Optional) MDX Architecture Specific Attributes & Defaults. Default: {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}
 - vr_params: (Optional) VR Architecture Specific Attributes & Defaults. Default: {"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}
 - demucs_params: (Optional) Demucs Architecture Specific Attributes & Defaults. Default: {"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}

diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py
@@ -151,7 +151,7 @@ def separate(self, audio_file_path):
         mix = self.prepare_mix(self.audio_file_path)
 
         self.logger.debug("Normalizing mix before demixing...")
-        mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold)
+        mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
 
         # Start the demixing process
         source = self.demix(mix)
@@ -166,7 +166,7 @@ def separate(self, audio_file_path):
         # Normalize and transpose the primary source if it's not already an array
         if not isinstance(self.primary_source, np.ndarray):
             self.logger.debug("Normalizing primary source...")
-            self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold).T
+            self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T
 
         # Process the secondary source if not already an array
         if not isinstance(self.secondary_source, np.ndarray):

diff --git a/audio_separator/separator/architectures/mdxc_separator.py b/audio_separator/separator/architectures/mdxc_separator.py
@@ -132,7 +132,7 @@ def separate(self, audio_file_path):
         mix = self.prepare_mix(self.audio_file_path)
 
         self.logger.debug("Normalizing mix before demixing...")
-        mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold)
+        mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
 
         source = self.demix(mix=mix)
         self.logger.debug("Demixing completed.")
@@ -145,11 +145,11 @@ def separate(self, audio_file_path):
 
             if not isinstance(self.primary_source, np.ndarray):
                 self.logger.debug(f"Normalizing primary source for primary stem {self.primary_stem_name}...")
-                self.primary_source = spec_utils.normalize(wave=source[self.primary_stem_name], max_peak=self.normalization_threshold).T
+                self.primary_source = spec_utils.normalize(wave=source[self.primary_stem_name], max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T
 
             if not isinstance(self.secondary_source, np.ndarray):
                 self.logger.debug(f"Normalizing secondary source for secondary stem {self.secondary_stem_name}...")
-                self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold).T
+                self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T
 
             if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
                 self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")

diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py
@@ -76,10 +76,12 @@ def __init__(self, config):
 
         # Functional options which are applicable to all architectures and the user may tweak to affect the output
         self.normalization_threshold = config.get("normalization_threshold")
+        self.amplification_threshold = config.get("amplification_threshold")
         self.enable_denoise = config.get("enable_denoise")
         self.output_single_stem = config.get("output_single_stem")
         self.invert_using_spec = config.get("invert_using_spec")
         self.sample_rate = config.get("sample_rate")
+        self.use_soundfile = config.get("use_soundfile")
 
         # Model specific properties
 
@@ -103,7 +105,7 @@ def __init__(self, config):
 
         self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}")
         self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}")
-        self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}")
+        self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}, amplification_threshold={self.amplification_threshold}")
         self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}")
         self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}")
 
@@ -241,7 +243,7 @@ def write_audio(self, stem_path: str, stem_source):
         duration_hours = duration_seconds / 3600
         self.logger.info(f"Audio duration is {duration_hours:.2f} hours ({duration_seconds:.2f} seconds).")
 
-        if duration_hours >= 1:
+        if self.use_soundfile:
             self.logger.warning(f"Using soundfile for writing.")
             self.write_audio_soundfile(stem_path, stem_source)
         else:
@@ -254,7 +256,7 @@ def write_audio_pydub(self, stem_path: str, stem_source):
         """
         self.logger.debug(f"Entering write_audio_pydub with stem_path: {stem_path}")
 
-        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold)
+        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
 
         # Check if the numpy array is empty or contains very low values
         if np.max(np.abs(stem_source)) < 1e-6:

diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
@@ -37,10 +37,12 @@ class Separator:
         output_dir (str): The directory where output files will be saved.
         output_format (str): The format of the output audio file.
         output_bitrate (str): The bitrate of the output audio file.
+        amplification_threshold (float): The threshold for audio amplification.
         normalization_threshold (float): The threshold for audio normalization.
         output_single_stem (str): Option to output a single stem.
         invert_using_spec (bool): Flag to invert using spectrogram.
         sample_rate (int): The sample rate of the audio.
+        use_soundfile (bool): Use soundfile for audio writing, can solve OOM issues.
 
     MDX Architecture Specific Attributes:
         hop_length (int): The hop length for STFT.
@@ -71,9 +73,11 @@ def __init__(
         output_format="WAV",
         output_bitrate=None,
         normalization_threshold=0.9,
+        amplification_threshold=0.6,
         output_single_stem=None,
         invert_using_spec=False,
         sample_rate=44100,
+        use_soundfile=False,
         mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
         vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
         demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
@@ -123,6 +127,10 @@ def __init__(
         self.normalization_threshold = normalization_threshold
         if normalization_threshold <= 0 or normalization_threshold > 1:
             raise ValueError("The normalization_threshold must be greater than 0 and less than or equal to 1.")
+
+        self.amplification_threshold = amplification_threshold
+        if amplification_threshold <= 0 or amplification_threshold > 1:
+            raise ValueError("The amplification_threshold must be greater than 0 and less than or equal to 1.")
 
         self.output_single_stem = output_single_stem
         if output_single_stem is not None:
@@ -140,6 +148,8 @@ def __init__(
                 raise ValueError(f"The sample rate setting is {self.sample_rate}. Enter something less ambitious.")
         except ValueError:
             raise ValueError("The sample rate must be a non-zero whole number. Please provide a valid integer.")
+
+        self.use_soundfile = use_soundfile
 
         # These are parameters which users may want to configure so we expose them to the top-level Separator class,
         # even though they are specific to a single model architecture
@@ -684,9 +694,11 @@ def load_model(self, model_filename="model_mel_band_roformer_ep_3005_sdr_11.4360
             "output_bitrate": self.output_bitrate,
             "output_dir": self.output_dir,
             "normalization_threshold": self.normalization_threshold,
+            "amplification_threshold": self.amplification_threshold,
             "output_single_stem": self.output_single_stem,
             "invert_using_spec": self.invert_using_spec,
             "sample_rate": self.sample_rate,
+            "use_soundfile": self.use_soundfile
         }
 
         # Instantiate the appropriate separator class depending on the model type
@@ -730,6 +742,7 @@ def separate(self, audio_file_path):
         separate_start_time = time.perf_counter()
 
         self.logger.debug(f"Normalization threshold set to {self.normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping.")
+        self.logger.debug(f"Amplification threshold set to {self.amplification_threshold}, waveform will scaled up to this max amplitude if below it.")
 
         # Run separation method for the loaded model
         output_files = self.model_instance.separate(audio_file_path)

diff --git a/audio_separator/separator/uvr_lib_v5/spec_utils.py b/audio_separator/separator/uvr_lib_v5/spec_utils.py
@@ -96,8 +96,8 @@ def make_padding(width, cropsize, offset):
     return left, right, roi_size
 
 
-def normalize(wave, max_peak=1.0):
-    """Normalize audio waveform to a specified peak value.
+def normalize(wave, max_peak=1.0, min_peak=None):
+    """Normalize (or amplify) audio waveform to a specified peak value.
 
     Args:
         wave (array-like): Audio waveform.
@@ -109,6 +109,8 @@ def normalize(wave, max_peak=1.0):
     maxv = np.abs(wave).max()
     if maxv > max_peak:
         wave *= max_peak / maxv
+    elif min_peak is not None and maxv < min_peak:
+        wave *= min_peak / maxv
 
     return wave
 

diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py
@@ -51,14 +51,18 @@ def main():
 
     invert_spect_help = "invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect"
     normalization_help = "max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7"
+    amplification_help = "min peak amplitude to amplify input and output audio to (default: %(default)s). Example: --amplification=0.4"
     single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental"
     sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
+    use_soundfile_help = "Use soundfile to write audio output (default: %(default)s). Example: --use_soundfile"
 
     common_params = parser.add_argument_group("Common Separation Parameters")
     common_params.add_argument("--invert_spect", action="store_true", help=invert_spect_help)
     common_params.add_argument("--normalization", type=float, default=0.9, help=normalization_help)
+    common_params.add_argument("--amplification", type=float, default=0.6, help=amplification_help)
     common_params.add_argument("--single_stem", default=None, help=single_stem_help)
     common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help)
+    common_params.add_argument("--use_soundfile", action="store_true", help=use_soundfile_help)
 
     mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256"
     mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25"
@@ -155,6 +159,7 @@ def main():
         output_format=args.output_format,
         output_bitrate=args.output_bitrate,
         normalization_threshold=args.normalization,
+        amplification_threshold=args.amplification,
         output_single_stem=args.single_stem,
         invert_using_spec=args.invert_spect,
         sample_rate=args.sample_rate,

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "audio-separator"
-version = "0.21.1"
+version = "0.21.2"
 description = "Easy to use audio stem separation, using various models from UVR trained primarily by @Anjok07"
 authors = ["Andrew Beveridge <[email protected]>"]
 license = "MIT"

diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
@@ -150,6 +150,20 @@ def test_cli_normalization_threshold_argument(common_expected_args):
             # Assertions
             mock_separator.assert_called_once_with(**common_expected_args)
 
+# Test using amplification_threshold argument
+def test_cli_amplification_threshold_argument(common_expected_args):
+    test_args = ["cli.py", "test_audio.mp3", "--amplification=0.75"]
+    with patch("sys.argv", test_args):
+        with patch("audio_separator.separator.Separator") as mock_separator:
+            mock_separator_instance = mock_separator.return_value
+            mock_separator_instance.separate.return_value = ["output_file.mp3"]
+            main()
+
+            # Update expected args for this specific test
+            common_expected_args["amplification_threshold"] = 0.75
+
+            # Assertions
+            mock_separator.assert_called_once_with(**common_expected_args)
 
 # Test using single stem argument
 def test_cli_single_stem_argument(common_expected_args):