Skip to content

Commit

Permalink
Add amplification and soundfile optional (#125)
Browse files Browse the repository at this point in the history
Co-authored-by: Christiaan van Luik <[email protected]>
  • Loading branch information
cvl01 and Christiaan van Luik authored Oct 8, 2024
1 parent 4ca6afe commit f273328
Show file tree
Hide file tree
Showing 9 changed files with 49 additions and 11 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,11 @@ output_file_paths_6 = separator.separate('audio3.wav')
- output_dir: (Optional) Directory where the separated files will be saved. If not specified, uses the current directory.
- output_format: (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). Default: WAV
- normalization_threshold: (Optional) The max peak amplitude to normalize input and output audio to. If the peak amplitude of the audio is above this threshold, the waveform will be scaled down to meet it. Default: 0.9
- amplification_threshold: (Optional) The minimum amplitude level at which the waveform will be amplified. If the peak amplitude of the audio is below this threshold, the waveform will be scaled up to meet it. Default: 0.6
- output_single_stem: (Optional) Output only a single stem, such as 'Instrumental' and 'Vocals'. Default: None
- invert_using_spec: (Optional) Flag to invert using spectrogram. Default: False
- sample_rate: (Optional) Set the sample rate of the output audio. Default: 44100
- use_soundfile: (Optional) Use soundfile for output writing, can solve OOM issues, especially on longer audio. Default: False
- mdx_params: (Optional) MDX Architecture Specific Attributes & Defaults. Default: {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}
- vr_params: (Optional) VR Architecture Specific Attributes & Defaults. Default: {"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}
- demucs_params: (Optional) Demucs Architecture Specific Attributes & Defaults. Default: {"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}
Expand Down
4 changes: 2 additions & 2 deletions audio_separator/separator/architectures/mdx_separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def separate(self, audio_file_path):
mix = self.prepare_mix(self.audio_file_path)

self.logger.debug("Normalizing mix before demixing...")
mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold)
mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)

# Start the demixing process
source = self.demix(mix)
Expand All @@ -166,7 +166,7 @@ def separate(self, audio_file_path):
# Normalize and transpose the primary source if it's not already an array
if not isinstance(self.primary_source, np.ndarray):
self.logger.debug("Normalizing primary source...")
self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold).T
self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T

# Process the secondary source if not already an array
if not isinstance(self.secondary_source, np.ndarray):
Expand Down
6 changes: 3 additions & 3 deletions audio_separator/separator/architectures/mdxc_separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def separate(self, audio_file_path):
mix = self.prepare_mix(self.audio_file_path)

self.logger.debug("Normalizing mix before demixing...")
mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold)
mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)

source = self.demix(mix=mix)
self.logger.debug("Demixing completed.")
Expand All @@ -145,11 +145,11 @@ def separate(self, audio_file_path):

if not isinstance(self.primary_source, np.ndarray):
self.logger.debug(f"Normalizing primary source for primary stem {self.primary_stem_name}...")
self.primary_source = spec_utils.normalize(wave=source[self.primary_stem_name], max_peak=self.normalization_threshold).T
self.primary_source = spec_utils.normalize(wave=source[self.primary_stem_name], max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T

if not isinstance(self.secondary_source, np.ndarray):
self.logger.debug(f"Normalizing secondary source for secondary stem {self.secondary_stem_name}...")
self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold).T
self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T

if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
Expand Down
8 changes: 5 additions & 3 deletions audio_separator/separator/common_separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,12 @@ def __init__(self, config):

# Functional options which are applicable to all architectures and the user may tweak to affect the output
self.normalization_threshold = config.get("normalization_threshold")
self.amplification_threshold = config.get("amplification_threshold")
self.enable_denoise = config.get("enable_denoise")
self.output_single_stem = config.get("output_single_stem")
self.invert_using_spec = config.get("invert_using_spec")
self.sample_rate = config.get("sample_rate")
self.use_soundfile = config.get("use_soundfile")

# Model specific properties

Expand All @@ -103,7 +105,7 @@ def __init__(self, config):

self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}")
self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}")
self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}")
self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}, amplification_threshold={self.amplification_threshold}")
self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}")
self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}")

Expand Down Expand Up @@ -241,7 +243,7 @@ def write_audio(self, stem_path: str, stem_source):
duration_hours = duration_seconds / 3600
self.logger.info(f"Audio duration is {duration_hours:.2f} hours ({duration_seconds:.2f} seconds).")

if duration_hours >= 1:
if self.use_soundfile:
self.logger.warning(f"Using soundfile for writing.")
self.write_audio_soundfile(stem_path, stem_source)
else:
Expand All @@ -254,7 +256,7 @@ def write_audio_pydub(self, stem_path: str, stem_source):
"""
self.logger.debug(f"Entering write_audio_pydub with stem_path: {stem_path}")

stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold)
stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)

# Check if the numpy array is empty or contains very low values
if np.max(np.abs(stem_source)) < 1e-6:
Expand Down
13 changes: 13 additions & 0 deletions audio_separator/separator/separator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ class Separator:
output_dir (str): The directory where output files will be saved.
output_format (str): The format of the output audio file.
output_bitrate (str): The bitrate of the output audio file.
amplification_threshold (float): The threshold for audio amplification.
normalization_threshold (float): The threshold for audio normalization.
output_single_stem (str): Option to output a single stem.
invert_using_spec (bool): Flag to invert using spectrogram.
sample_rate (int): The sample rate of the audio.
use_soundfile (bool): Use soundfile for audio writing, can solve OOM issues.
MDX Architecture Specific Attributes:
hop_length (int): The hop length for STFT.
Expand Down Expand Up @@ -71,9 +73,11 @@ def __init__(
output_format="WAV",
output_bitrate=None,
normalization_threshold=0.9,
amplification_threshold=0.6,
output_single_stem=None,
invert_using_spec=False,
sample_rate=44100,
use_soundfile=False,
mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
Expand Down Expand Up @@ -123,6 +127,10 @@ def __init__(
self.normalization_threshold = normalization_threshold
if normalization_threshold <= 0 or normalization_threshold > 1:
raise ValueError("The normalization_threshold must be greater than 0 and less than or equal to 1.")

self.amplification_threshold = amplification_threshold
if amplification_threshold <= 0 or amplification_threshold > 1:
raise ValueError("The amplification_threshold must be greater than 0 and less than or equal to 1.")

self.output_single_stem = output_single_stem
if output_single_stem is not None:
Expand All @@ -140,6 +148,8 @@ def __init__(
raise ValueError(f"The sample rate setting is {self.sample_rate}. Enter something less ambitious.")
except ValueError:
raise ValueError("The sample rate must be a non-zero whole number. Please provide a valid integer.")

self.use_soundfile = use_soundfile

# These are parameters which users may want to configure so we expose them to the top-level Separator class,
# even though they are specific to a single model architecture
Expand Down Expand Up @@ -684,9 +694,11 @@ def load_model(self, model_filename="model_mel_band_roformer_ep_3005_sdr_11.4360
"output_bitrate": self.output_bitrate,
"output_dir": self.output_dir,
"normalization_threshold": self.normalization_threshold,
"amplification_threshold": self.amplification_threshold,
"output_single_stem": self.output_single_stem,
"invert_using_spec": self.invert_using_spec,
"sample_rate": self.sample_rate,
"use_soundfile": self.use_soundfile
}

# Instantiate the appropriate separator class depending on the model type
Expand Down Expand Up @@ -730,6 +742,7 @@ def separate(self, audio_file_path):
separate_start_time = time.perf_counter()

self.logger.debug(f"Normalization threshold set to {self.normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping.")
self.logger.debug(f"Amplification threshold set to {self.amplification_threshold}, waveform will scaled up to this max amplitude if below it.")

# Run separation method for the loaded model
output_files = self.model_instance.separate(audio_file_path)
Expand Down
6 changes: 4 additions & 2 deletions audio_separator/separator/uvr_lib_v5/spec_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ def make_padding(width, cropsize, offset):
return left, right, roi_size


def normalize(wave, max_peak=1.0):
"""Normalize audio waveform to a specified peak value.
def normalize(wave, max_peak=1.0, min_peak=None):
"""Normalize (or amplify) audio waveform to a specified peak value.
Args:
wave (array-like): Audio waveform.
Expand All @@ -109,6 +109,8 @@ def normalize(wave, max_peak=1.0):
maxv = np.abs(wave).max()
if maxv > max_peak:
wave *= max_peak / maxv
elif min_peak is not None and maxv < min_peak:
wave *= min_peak / maxv

return wave

Expand Down
5 changes: 5 additions & 0 deletions audio_separator/utils/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,18 @@ def main():

invert_spect_help = "invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect"
normalization_help = "max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7"
amplification_help = "min peak amplitude to amplify input and output audio to (default: %(default)s). Example: --amplification=0.4"
single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental"
sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
use_soundfile_help = "Use soundfile to write audio output (default: %(default)s). Example: --use_soundfile"

common_params = parser.add_argument_group("Common Separation Parameters")
common_params.add_argument("--invert_spect", action="store_true", help=invert_spect_help)
common_params.add_argument("--normalization", type=float, default=0.9, help=normalization_help)
common_params.add_argument("--amplification", type=float, default=0.6, help=amplification_help)
common_params.add_argument("--single_stem", default=None, help=single_stem_help)
common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help)
common_params.add_argument("--use_soundfile", action="store_true", help=use_soundfile_help)

mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256"
mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25"
Expand Down Expand Up @@ -155,6 +159,7 @@ def main():
output_format=args.output_format,
output_bitrate=args.output_bitrate,
normalization_threshold=args.normalization,
amplification_threshold=args.amplification,
output_single_stem=args.single_stem,
invert_using_spec=args.invert_spect,
sample_rate=args.sample_rate,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "audio-separator"
version = "0.21.1"
version = "0.21.2"
description = "Easy to use audio stem separation, using various models from UVR trained primarily by @Anjok07"
authors = ["Andrew Beveridge <[email protected]>"]
license = "MIT"
Expand Down
14 changes: 14 additions & 0 deletions tests/unit/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,20 @@ def test_cli_normalization_threshold_argument(common_expected_args):
# Assertions
mock_separator.assert_called_once_with(**common_expected_args)

# Test using amplification_threshold argument
def test_cli_amplification_threshold_argument(common_expected_args):
    """Check that ``--amplification=0.75`` is passed to Separator as ``amplification_threshold``."""
    # Override only the keyword under test; every other expected kwarg comes from the fixture.
    common_expected_args["amplification_threshold"] = 0.75
    argv = ["cli.py", "test_audio.mp3", "--amplification=0.75"]

    # Patch the command line and the Separator class together so main() runs without real work.
    with patch("sys.argv", argv), patch("audio_separator.separator.Separator") as separator_cls:
        separator_cls.return_value.separate.return_value = ["output_file.mp3"]
        main()

    # The CLI must construct Separator exactly once with the expected keyword arguments.
    separator_cls.assert_called_once_with(**common_expected_args)

# Test using single stem argument
def test_cli_single_stem_argument(common_expected_args):
Expand Down

0 comments on commit f273328

Please sign in to comment.