From 063121039d0676299f84a2df59582c1a348efdb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20P=C3=A9rez?= Date: Mon, 1 Nov 2021 12:53:43 +0100 Subject: [PATCH] format files with Black --- Installation/nnAudio/Spectrogram.py | 7 +- Installation/nnAudio/__init__.py | 2 +- Installation/nnAudio/features/__init__.py | 2 +- Installation/nnAudio/features/cfp.py | 228 ++++--- Installation/nnAudio/features/cqt.py | 563 +++++++++++------- Installation/nnAudio/features/gammatone.py | 67 ++- Installation/nnAudio/features/griffin_lim.py | 86 +-- Installation/nnAudio/features/mel.py | 101 +++- Installation/nnAudio/features/stft.py | 230 ++++--- Installation/nnAudio/librosa_functions.py | 253 +++++--- Installation/nnAudio/utils.py | 367 +++++++----- Installation/setup.py | 25 +- Installation/tests/parameters.py | 24 +- Installation/tests/test_cfp.py | 102 ++++ Installation/tests/test_cqt.py | 292 +++++++++ .../tests/{tests_stft.py => test_stft.py} | 62 +- Installation/tests/tests_cfp.py | 89 --- Installation/tests/tests_cqt.py | 232 -------- Sphinx/source/conf.py | 87 +-- 19 files changed, 1752 insertions(+), 1067 deletions(-) create mode 100644 Installation/tests/test_cfp.py create mode 100644 Installation/tests/test_cqt.py rename Installation/tests/{tests_stft.py => test_stft.py} (73%) delete mode 100644 Installation/tests/tests_cfp.py delete mode 100644 Installation/tests/tests_cqt.py diff --git a/Installation/nnAudio/Spectrogram.py b/Installation/nnAudio/Spectrogram.py index 7e683ca..8c98e36 100755 --- a/Installation/nnAudio/Spectrogram.py +++ b/Installation/nnAudio/Spectrogram.py @@ -1,5 +1,8 @@ from .features import * import warnings -warnings.warn("importing Spectrogram subpackage will be deprecated soon. You should import the feature extractor " - "from the feature subpackage. See actual documentation.", category=Warning) +warnings.warn( + "importing Spectrogram subpackage will be deprecated soon. You should import the feature extractor " + "from the feature subpackage. See actual documentation.", + category=Warning, +) diff --git a/Installation/nnAudio/__init__.py b/Installation/nnAudio/__init__.py index d1eb742..01ef120 100755 --- a/Installation/nnAudio/__init__.py +++ b/Installation/nnAudio/__init__.py @@ -1 +1 @@ -__version__ = "0.2.6" \ No newline at end of file +__version__ = "0.2.6" diff --git a/Installation/nnAudio/features/__init__.py b/Installation/nnAudio/features/__init__.py index 1118ccb..9dc94f8 100644 --- a/Installation/nnAudio/features/__init__.py +++ b/Installation/nnAudio/features/__init__.py @@ -10,4 +10,4 @@ from .gammatone import * from .griffin_lim import * from .mel import * -from .stft import * \ No newline at end of file +from .stft import * diff --git a/Installation/nnAudio/features/cfp.py b/Installation/nnAudio/features/cfp.py index b8900f3..19429e1 100644 --- a/Installation/nnAudio/features/cfp.py +++ b/Installation/nnAudio/features/cfp.py @@ -63,9 +63,17 @@ class Combined_Frequency_Periodicity(nn.Module): """ - def __init__(self, fr=2, fs=16000, hop_length=320, - window_size=2049, fc=80, tc=1 / 1000, - g=[0.24, 0.6, 1], NumPerOct=48): + def __init__( + self, + fr=2, + fs=16000, + hop_length=320, + window_size=2049, + fc=80, + tc=1 / 1000, + g=[0.24, 0.6, 1], + NumPerOct=48, + ): super().__init__() self.window_size = window_size @@ -73,29 +81,40 @@ def __init__(self, fr=2, fs=16000, hop_length=320, # variables for STFT part self.N = int(fs / float(fr)) # Will be used to calculate padding - self.f = fs * np.linspace(0, 0.5, np.round(self.N // 2), - endpoint=True) # it won't be used but will be returned - self.pad_value = ((self.N - window_size)) + self.f = fs * np.linspace( + 0, 0.5, np.round(self.N // 2), endpoint=True + ) # it won't be used but will be returned + self.pad_value = self.N - window_size # Create window function, always blackmanharris? h = scipy.signal.blackmanharris(window_size) # window function for STFT - self.register_buffer('h', torch.tensor(h).float()) + self.register_buffer("h", torch.tensor(h).float()) # variables for CFP self.NumofLayer = np.size(g) self.g = g - self.tc_idx = round(fs * tc) # index to filter out top tc_idx and bottom tc_idx bins - self.fc_idx = round(fc / fr) # index to filter out top fc_idx and bottom fc_idx bins + self.tc_idx = round( + fs * tc + ) # index to filter out top tc_idx and bottom tc_idx bins + self.fc_idx = round( + fc / fr + ) # index to filter out top fc_idx and bottom fc_idx bins self.HighFreqIdx = int(round((1 / tc) / fr) + 1) self.HighQuefIdx = int(round(fs / fc) + 1) # attributes to be returned - self.f = self.f[:self.HighFreqIdx] + self.f = self.f[: self.HighFreqIdx] self.q = np.arange(self.HighQuefIdx) / float(fs) # filters for the final step - freq2logfreq_matrix, quef2logfreq_matrix = self.create_logfreq_matrix(self.f, self.q, fr, fc, tc, NumPerOct, fs) - self.register_buffer('freq2logfreq_matrix', torch.tensor(freq2logfreq_matrix).float()) - self.register_buffer('quef2logfreq_matrix', torch.tensor(quef2logfreq_matrix).float()) + freq2logfreq_matrix, quef2logfreq_matrix = self.create_logfreq_matrix( + self.f, self.q, fr, fc, tc, NumPerOct, fs + ) + self.register_buffer( + "freq2logfreq_matrix", torch.tensor(freq2logfreq_matrix).float() + ) + self.register_buffer( + "quef2logfreq_matrix", torch.tensor(quef2logfreq_matrix).float() + ) def _CFP(self, spec): spec = torch.relu(spec).pow(self.g[0]) @@ -103,42 +122,59 @@ def _CFP(self, spec): if self.NumofLayer >= 2: for gc in range(1, self.NumofLayer): if np.remainder(gc, 2) == 1: - ceps = rfft_fn(spec, 1, onesided=False)[:, :, :, 0] / np.sqrt(self.N) + ceps = rfft_fn(spec, 1, onesided=False)[:, :, :, 0] / np.sqrt( + self.N + ) ceps = self.nonlinear_func(ceps, self.g[gc], self.tc_idx) else: - spec = rfft_fn(ceps, 1, onesided=False)[:, :, :, 0] / np.sqrt(self.N) + spec = rfft_fn(ceps, 1, onesided=False)[:, :, :, 0] / np.sqrt( + self.N + ) spec = self.nonlinear_func(spec, self.g[gc], self.fc_idx) return spec, ceps def forward(self, x): - tfr0 = torch.stft(x, self.N, hop_length=self.hop_length, win_length=self.window_size, - window=self.h, onesided=False, pad_mode='constant') - tfr0 = torch.sqrt(tfr0.pow(2).sum(-1)) / torch.norm(self.h) # calcuate magnitude - tfr0 = tfr0.transpose(1, 2)[:, 1:-1] # transpose F and T axis and discard first and last frames + tfr0 = torch.stft( + x, + self.N, + hop_length=self.hop_length, + win_length=self.window_size, + window=self.h, + onesided=False, + pad_mode="constant", + ) + tfr0 = torch.sqrt(tfr0.pow(2).sum(-1)) / torch.norm( + self.h + ) # calcuate magnitude + tfr0 = tfr0.transpose(1, 2)[ + :, 1:-1 + ] # transpose F and T axis and discard first and last frames # The transpose is necessary for rfft later # (batch, timesteps, n_fft) tfr, ceps = self._CFP(tfr0) # return tfr0 # removing duplicate bins - tfr0 = tfr0[:, :, :int(round(self.N / 2))] - tfr = tfr[:, :, :int(round(self.N / 2))] - ceps = ceps[:, :, :int(round(self.N / 2))] + tfr0 = tfr0[:, :, : int(round(self.N / 2))] + tfr = tfr[:, :, : int(round(self.N / 2))] + ceps = ceps[:, :, : int(round(self.N / 2))] # Crop up to the highest frequency - tfr0 = tfr0[:, :, :self.HighFreqIdx] - tfr = tfr[:, :, :self.HighFreqIdx] - ceps = ceps[:, :, :self.HighQuefIdx] + tfr0 = tfr0[:, :, : self.HighFreqIdx] + tfr = tfr[:, :, : self.HighFreqIdx] + ceps = ceps[:, :, : self.HighQuefIdx] tfrL0 = torch.matmul(self.freq2logfreq_matrix, tfr0.transpose(1, 2)) tfrLF = torch.matmul(self.freq2logfreq_matrix, tfr.transpose(1, 2)) tfrLQ = torch.matmul(self.quef2logfreq_matrix, ceps.transpose(1, 2)) Z = tfrLF * tfrLQ # Only need to calculate this once - self.t = np.arange(self.hop_length, - np.ceil(len(x) / float(self.hop_length)) * self.hop_length, - self.hop_length) # it won't be used but will be returned + self.t = np.arange( + self.hop_length, + np.ceil(len(x) / float(self.hop_length)) * self.hop_length, + self.hop_length, + ) # it won't be used but will be returned return Z, tfrL0, tfrLF, tfrLQ @@ -181,23 +217,30 @@ def create_logfreq_matrix(self, f, q, fr, fc, tc, NumPerOct, fs): else: for j in range(l, r): if f[j] > central_freq[i - 1] and f[j] < central_freq[i]: - freq_band_transformation[i, j] = (f[j] - central_freq[i - 1]) / ( - central_freq[i] - central_freq[i - 1]) + freq_band_transformation[i, j] = ( + f[j] - central_freq[i - 1] + ) / (central_freq[i] - central_freq[i - 1]) elif f[j] > central_freq[i] and f[j] < central_freq[i + 1]: - freq_band_transformation[i, j] = (central_freq[i + 1] - f[j]) / ( - central_freq[i + 1] - central_freq[i]) + freq_band_transformation[i, j] = ( + central_freq[i + 1] - f[j] + ) / (central_freq[i + 1] - central_freq[i]) # Calculating the quef_band_transformation f = 1 / q # divide by 0, do I need to fix this? quef_band_transformation = np.zeros((Nest - 1, len(f)), dtype=np.float) for i in range(1, Nest - 1): - for j in range(int(round(fs / central_freq[i + 1])), int(round(fs / central_freq[i - 1]) + 1)): + for j in range( + int(round(fs / central_freq[i + 1])), + int(round(fs / central_freq[i - 1]) + 1), + ): if f[j] > central_freq[i - 1] and f[j] < central_freq[i]: quef_band_transformation[i, j] = (f[j] - central_freq[i - 1]) / ( - central_freq[i] - central_freq[i - 1]) + central_freq[i] - central_freq[i - 1] + ) elif f[j] > central_freq[i] and f[j] < central_freq[i + 1]: quef_band_transformation[i, j] = (central_freq[i + 1] - f[j]) / ( - central_freq[i + 1] - central_freq[i]) + central_freq[i + 1] - central_freq[i] + ) return freq_band_transformation, quef_band_transformation @@ -257,9 +300,17 @@ class CFP(nn.Module): """ - def __init__(self, fr=2, fs=16000, hop_length=320, - window_size=2049, fc=80, tc=1 / 1000, - g=[0.24, 0.6, 1], NumPerOct=48): + def __init__( + self, + fr=2, + fs=16000, + hop_length=320, + window_size=2049, + fc=80, + tc=1 / 1000, + g=[0.24, 0.6, 1], + NumPerOct=48, + ): super().__init__() self.window_size = window_size @@ -267,29 +318,40 @@ def __init__(self, fr=2, fs=16000, hop_length=320, # variables for STFT part self.N = int(fs / float(fr)) # Will be used to calculate padding - self.f = fs * np.linspace(0, 0.5, np.round(self.N // 2), - endpoint=True) # it won't be used but will be returned - self.pad_value = ((self.N - window_size)) + self.f = fs * np.linspace( + 0, 0.5, np.round(self.N // 2), endpoint=True + ) # it won't be used but will be returned + self.pad_value = self.N - window_size # Create window function, always blackmanharris? h = scipy.signal.blackmanharris(window_size) # window function for STFT - self.register_buffer('h', torch.tensor(h).float()) + self.register_buffer("h", torch.tensor(h).float()) # variables for CFP self.NumofLayer = np.size(g) self.g = g - self.tc_idx = round(fs * tc) # index to filter out top tc_idx and bottom tc_idx bins - self.fc_idx = round(fc / fr) # index to filter out top fc_idx and bottom fc_idx bins + self.tc_idx = round( + fs * tc + ) # index to filter out top tc_idx and bottom tc_idx bins + self.fc_idx = round( + fc / fr + ) # index to filter out top fc_idx and bottom fc_idx bins self.HighFreqIdx = int(round((1 / tc) / fr) + 1) self.HighQuefIdx = int(round(fs / fc) + 1) # attributes to be returned - self.f = self.f[:self.HighFreqIdx] + self.f = self.f[: self.HighFreqIdx] self.q = np.arange(self.HighQuefIdx) / float(fs) # filters for the final step - freq2logfreq_matrix, quef2logfreq_matrix = self.create_logfreq_matrix(self.f, self.q, fr, fc, tc, NumPerOct, fs) - self.register_buffer('freq2logfreq_matrix', torch.tensor(freq2logfreq_matrix).float()) - self.register_buffer('quef2logfreq_matrix', torch.tensor(quef2logfreq_matrix).float()) + freq2logfreq_matrix, quef2logfreq_matrix = self.create_logfreq_matrix( + self.f, self.q, fr, fc, tc, NumPerOct, fs + ) + self.register_buffer( + "freq2logfreq_matrix", torch.tensor(freq2logfreq_matrix).float() + ) + self.register_buffer( + "quef2logfreq_matrix", torch.tensor(quef2logfreq_matrix).float() + ) def _CFP(self, spec): spec = torch.relu(spec).pow(self.g[0]) @@ -297,42 +359,59 @@ def _CFP(self, spec): if self.NumofLayer >= 2: for gc in range(1, self.NumofLayer): if np.remainder(gc, 2) == 1: - ceps = rfft_fn(spec, 1, onesided=False)[:, :, :, 0] / np.sqrt(self.N) + ceps = rfft_fn(spec, 1, onesided=False)[:, :, :, 0] / np.sqrt( + self.N + ) ceps = self.nonlinear_func(ceps, self.g[gc], self.tc_idx) else: - spec = rfft_fn(ceps, 1, onesided=False)[:, :, :, 0] / np.sqrt(self.N) + spec = rfft_fn(ceps, 1, onesided=False)[:, :, :, 0] / np.sqrt( + self.N + ) spec = self.nonlinear_func(spec, self.g[gc], self.fc_idx) return spec, ceps def forward(self, x): - tfr0 = torch.stft(x, self.N, hop_length=self.hop_length, win_length=self.window_size, - window=self.h, onesided=False, pad_mode='constant') - tfr0 = torch.sqrt(tfr0.pow(2).sum(-1)) / torch.norm(self.h) # calcuate magnitude - tfr0 = tfr0.transpose(1, 2) # transpose F and T axis and discard first and last frames + tfr0 = torch.stft( + x, + self.N, + hop_length=self.hop_length, + win_length=self.window_size, + window=self.h, + onesided=False, + pad_mode="constant", + ) + tfr0 = torch.sqrt(tfr0.pow(2).sum(-1)) / torch.norm( + self.h + ) # calcuate magnitude + tfr0 = tfr0.transpose( + 1, 2 + ) # transpose F and T axis and discard first and last frames # The transpose is necessary for rfft later # (batch, timesteps, n_fft) tfr, ceps = self._CFP(tfr0) # return tfr0 # removing duplicate bins - tfr0 = tfr0[:, :, :int(round(self.N / 2))] - tfr = tfr[:, :, :int(round(self.N / 2))] - ceps = ceps[:, :, :int(round(self.N / 2))] + tfr0 = tfr0[:, :, : int(round(self.N / 2))] + tfr = tfr[:, :, : int(round(self.N / 2))] + ceps = ceps[:, :, : int(round(self.N / 2))] # Crop up to the highest frequency - tfr0 = tfr0[:, :, :self.HighFreqIdx] - tfr = tfr[:, :, :self.HighFreqIdx] - ceps = ceps[:, :, :self.HighQuefIdx] + tfr0 = tfr0[:, :, : self.HighFreqIdx] + tfr = tfr[:, :, : self.HighFreqIdx] + ceps = ceps[:, :, : self.HighQuefIdx] tfrL0 = torch.matmul(self.freq2logfreq_matrix, tfr0.transpose(1, 2)) tfrLF = torch.matmul(self.freq2logfreq_matrix, tfr.transpose(1, 2)) tfrLQ = torch.matmul(self.quef2logfreq_matrix, ceps.transpose(1, 2)) Z = tfrLF * tfrLQ # Only need to calculate this once - self.t = np.arange(self.hop_length, - np.ceil(len(x) / float(self.hop_length)) * self.hop_length, - self.hop_length) # it won't be used but will be returned + self.t = np.arange( + self.hop_length, + np.ceil(len(x) / float(self.hop_length)) * self.hop_length, + self.hop_length, + ) # it won't be used but will be returned return Z # , tfrL0, tfrLF, tfrLQ @@ -375,22 +454,29 @@ def create_logfreq_matrix(self, f, q, fr, fc, tc, NumPerOct, fs): else: for j in range(l, r): if f[j] > central_freq[i - 1] and f[j] < central_freq[i]: - freq_band_transformation[i, j] = (f[j] - central_freq[i - 1]) / ( - central_freq[i] - central_freq[i - 1]) + freq_band_transformation[i, j] = ( + f[j] - central_freq[i - 1] + ) / (central_freq[i] - central_freq[i - 1]) elif f[j] > central_freq[i] and f[j] < central_freq[i + 1]: - freq_band_transformation[i, j] = (central_freq[i + 1] - f[j]) / ( - central_freq[i + 1] - central_freq[i]) + freq_band_transformation[i, j] = ( + central_freq[i + 1] - f[j] + ) / (central_freq[i + 1] - central_freq[i]) # Calculating the quef_band_transformation f = 1 / q # divide by 0, do I need to fix this? quef_band_transformation = np.zeros((Nest - 1, len(f)), dtype=np.float) for i in range(1, Nest - 1): - for j in range(int(round(fs / central_freq[i + 1])), int(round(fs / central_freq[i - 1]) + 1)): + for j in range( + int(round(fs / central_freq[i + 1])), + int(round(fs / central_freq[i - 1]) + 1), + ): if f[j] > central_freq[i - 1] and f[j] < central_freq[i]: quef_band_transformation[i, j] = (f[j] - central_freq[i - 1]) / ( - central_freq[i] - central_freq[i - 1]) + central_freq[i] - central_freq[i - 1] + ) elif f[j] > central_freq[i] and f[j] < central_freq[i + 1]: quef_band_transformation[i, j] = (central_freq[i + 1] - f[j]) / ( - central_freq[i + 1] - central_freq[i]) + central_freq[i + 1] - central_freq[i] + ) - return freq_band_transformation, quef_band_transformation \ No newline at end of file + return freq_band_transformation, quef_band_transformation diff --git a/Installation/nnAudio/features/cqt.py b/Installation/nnAudio/features/cqt.py index 72e2a95..82228cb 100644 --- a/Installation/nnAudio/features/cqt.py +++ b/Installation/nnAudio/features/cqt.py @@ -5,6 +5,7 @@ from ..utils import * from ..utils import * + class CQT1992(nn.Module): """ This alogrithm uses the method proposed in [1], which would run extremely slow if low frequencies (below 220Hz) @@ -102,9 +103,23 @@ class CQT1992(nn.Module): >>> specs = spec_layer(x) """ - def __init__(self, sr=22050, hop_length=512, fmin=220, fmax=None, n_bins=84, - trainable_STFT=False, trainable_CQT=False, bins_per_octave=12, filter_scale=1, - output_format='Magnitude', norm=1, window='hann', center=True, pad_mode='reflect'): + def __init__( + self, + sr=22050, + hop_length=512, + fmin=220, + fmax=None, + n_bins=84, + trainable_STFT=False, + trainable_CQT=False, + bins_per_octave=12, + filter_scale=1, + output_format="Magnitude", + norm=1, + window="hann", + center=True, + pad_mode="reflect", + ): super().__init__() @@ -118,32 +133,27 @@ def __init__(self, sr=22050, hop_length=512, fmin=220, fmax=None, n_bins=84, # creating kernels for CQT Q = float(filter_scale) / (2 ** (1 / bins_per_octave) - 1) - print("Creating CQT kernels ...", end='\r') + print("Creating CQT kernels ...", end="\r") start = time() - cqt_kernels, self.kernel_width, lenghts, freqs = create_cqt_kernels(Q, - sr, - fmin, - n_bins, - bins_per_octave, - norm, - window, - fmax) - - self.register_buffer('lenghts', lenghts) + cqt_kernels, self.kernel_width, lenghts, freqs = create_cqt_kernels( + Q, sr, fmin, n_bins, bins_per_octave, norm, window, fmax + ) + + self.register_buffer("lenghts", lenghts) self.frequencies = freqs - cqt_kernels = fft(cqt_kernels)[:, :self.kernel_width // 2 + 1] + cqt_kernels = fft(cqt_kernels)[:, : self.kernel_width // 2 + 1] print("CQT kernels created, time used = {:.4f} seconds".format(time() - start)) # creating kernels for stft # self.cqt_kernels_real*=lenghts.unsqueeze(1)/self.kernel_width # Trying to normalize as librosa # self.cqt_kernels_imag*=lenghts.unsqueeze(1)/self.kernel_width - print("Creating STFT kernels ...", end='\r') + print("Creating STFT kernels ...", end="\r") start = time() - kernel_sin, kernel_cos, self.bins2freq, _, window = create_fourier_kernels(self.kernel_width, - window='ones', - freq_scale='no') + kernel_sin, kernel_cos, self.bins2freq, _, window = create_fourier_kernels( + self.kernel_width, window="ones", freq_scale="no" + ) # Converting kernels from numpy arrays to torch tensors wsin = torch.tensor(kernel_sin * window) @@ -155,24 +165,28 @@ def __init__(self, sr=22050, hop_length=512, fmin=220, fmax=None, n_bins=84, if trainable_STFT: wsin = nn.Parameter(wsin, requires_grad=trainable_STFT) wcos = nn.Parameter(wcos, requires_grad=trainable_STFT) - self.register_parameter('wsin', wsin) - self.register_parameter('wcos', wcos) + self.register_parameter("wsin", wsin) + self.register_parameter("wcos", wcos) else: - self.register_buffer('wsin', wsin) - self.register_buffer('wcos', wcos) + self.register_buffer("wsin", wsin) + self.register_buffer("wcos", wcos) if trainable_CQT: - cqt_kernels_real = nn.Parameter(cqt_kernels_real, requires_grad=trainable_CQT) - cqt_kernels_imag = nn.Parameter(cqt_kernels_imag, requires_grad=trainable_CQT) - self.register_parameter('cqt_kernels_real', cqt_kernels_real) - self.register_parameter('cqt_kernels_imag', cqt_kernels_imag) + cqt_kernels_real = nn.Parameter( + cqt_kernels_real, requires_grad=trainable_CQT + ) + cqt_kernels_imag = nn.Parameter( + cqt_kernels_imag, requires_grad=trainable_CQT + ) + self.register_parameter("cqt_kernels_real", cqt_kernels_real) + self.register_parameter("cqt_kernels_imag", cqt_kernels_imag) else: - self.register_buffer('cqt_kernels_real', cqt_kernels_real) - self.register_buffer('cqt_kernels_imag', cqt_kernels_imag) + self.register_buffer("cqt_kernels_real", cqt_kernels_real) + self.register_buffer("cqt_kernels_imag", cqt_kernels_imag) print("STFT kernels created, time used = {:.4f} seconds".format(time() - start)) - def forward(self, x, output_format=None, normalization_type='librosa'): + def forward(self, x, output_format=None, normalization_type="librosa"): """ Convert a batch of waveforms to CQT spectrograms. @@ -189,9 +203,9 @@ def forward(self, x, output_format=None, normalization_type='librosa'): x = broadcast_dim(x) if self.center: - if self.pad_mode == 'constant': + if self.pad_mode == "constant": padding = nn.ConstantPad1d(self.kernel_width // 2, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": padding = nn.ReflectionPad1d(self.kernel_width // 2) x = padding(x) @@ -201,39 +215,43 @@ def forward(self, x, output_format=None, normalization_type='librosa'): fourier_imag = conv1d(x, self.wsin, stride=self.hop_length) # CQT - CQT_real, CQT_imag = complex_mul((self.cqt_kernels_real, self.cqt_kernels_imag), - (fourier_real, fourier_imag)) + CQT_real, CQT_imag = complex_mul( + (self.cqt_kernels_real, self.cqt_kernels_imag), (fourier_real, fourier_imag) + ) CQT = torch.stack((CQT_real, -CQT_imag), -1) - if normalization_type == 'librosa': + if normalization_type == "librosa": CQT *= torch.sqrt(self.lenghts.view(-1, 1, 1)) / self.kernel_width - elif normalization_type == 'convolutional': + elif normalization_type == "convolutional": pass - elif normalization_type == 'wrap': + elif normalization_type == "wrap": CQT *= 2 / self.kernel_width else: - raise ValueError("The normalization_type %r is not part of our current options." % normalization_type) + raise ValueError( + "The normalization_type %r is not part of our current options." + % normalization_type + ) # if self.norm: # CQT = CQT/self.kernel_width*torch.sqrt(self.lenghts.view(-1,1,1)) # else: # CQT = CQT*torch.sqrt(self.lenghts.view(-1,1,1)) - if output_format == 'Magnitude': + if output_format == "Magnitude": # Getting CQT Amplitude return torch.sqrt(CQT.pow(2).sum(-1)) - elif output_format == 'Complex': + elif output_format == "Complex": return CQT - elif output_format == 'Phase': + elif output_format == "Phase": phase_real = torch.cos(torch.atan2(CQT_imag, CQT_real)) phase_imag = torch.sin(torch.atan2(CQT_imag, CQT_real)) return torch.stack((phase_real, phase_imag), -1) def extra_repr(self) -> str: - return 'STFT kernel size = {}, CQT kernel size = {}'.format( + return "STFT kernel size = {}, CQT kernel size = {}".format( (*self.wcos.shape,), (*self.cqt_kernels_real.shape,) ) @@ -256,39 +274,62 @@ class CQT2010(nn.Module): frequency region where freq < 40Hz. """ - def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, bins_per_octave=12, - norm=True, basis_norm=1, window='hann', pad_mode='reflect', trainable_STFT=False, filter_scale=1, - trainable_CQT=False, output_format='Magnitude', earlydownsample=True, verbose=True): + def __init__( + self, + sr=22050, + hop_length=512, + fmin=32.70, + fmax=None, + n_bins=84, + bins_per_octave=12, + norm=True, + basis_norm=1, + window="hann", + pad_mode="reflect", + trainable_STFT=False, + filter_scale=1, + trainable_CQT=False, + output_format="Magnitude", + earlydownsample=True, + verbose=True, + ): super().__init__() - self.norm = norm # Now norm is used to normalize the final CQT result by dividing n_fft + self.norm = ( + norm # Now norm is used to normalize the final CQT result by dividing n_fft + ) # basis_norm is for normalizing basis self.hop_length = hop_length self.pad_mode = pad_mode self.n_bins = n_bins self.output_format = output_format - self.earlydownsample = earlydownsample # TODO: activate early downsampling later if possible + self.earlydownsample = ( + earlydownsample # TODO: activate early downsampling later if possible + ) # This will be used to calculate filter_cutoff and creating CQT kernels Q = float(filter_scale) / (2 ** (1 / bins_per_octave) - 1) # Creating lowpass filter and make it a torch tensor if verbose == True: - print("Creating low pass filter ...", end='\r') + print("Creating low pass filter ...", end="\r") start = time() - lowpass_filter = torch.tensor(create_lowpass_filter( - band_center=0.5, - kernelLength=256, - transitionBandwidth=0.001 - ) + lowpass_filter = torch.tensor( + create_lowpass_filter( + band_center=0.5, kernelLength=256, transitionBandwidth=0.001 + ) ) # Broadcast the tensor to the shape that fits conv1d - self.register_buffer('lowpass_filter', lowpass_filter[None, None, :]) + self.register_buffer("lowpass_filter", lowpass_filter[None, None, :]) if verbose == True: - print("Low pass filter created, time used = {:.4f} seconds".format(time() - start)) + print( + "Low pass filter created, time used = {:.4f} seconds".format( + time() - start + ) + ) # Calculate num of filter requires for the kernel # n_octaves determines how many resampling requires for the CQT @@ -308,43 +349,59 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, b # Calculate the top bin frequency fmax_t = self.fmin_t * 2 ** ((remainder - 1) / bins_per_octave) - self.fmin_t = fmax_t / 2 ** (1 - 1 / bins_per_octave) # Adjusting the top minium bins + self.fmin_t = fmax_t / 2 ** ( + 1 - 1 / bins_per_octave + ) # Adjusting the top minium bins if fmax_t > sr / 2: - raise ValueError('The top bin {}Hz has exceeded the Nyquist frequency, \ - please reduce the n_bins'.format(fmax_t)) - - if self.earlydownsample == True: # Do early downsampling if this argument is True + raise ValueError( + "The top bin {}Hz has exceeded the Nyquist frequency, \ + please reduce the n_bins".format( + fmax_t + ) + ) + + if ( + self.earlydownsample == True + ): # Do early downsampling if this argument is True if verbose == True: - print("Creating early downsampling filter ...", end='\r') + print("Creating early downsampling filter ...", end="\r") start = time() - sr, self.hop_length, self.downsample_factor, early_downsample_filter, \ - self.earlydownsample = get_early_downsample_params(sr, - hop_length, - fmax_t, - Q, - self.n_octaves, - verbose) - - self.register_buffer('early_downsample_filter', early_downsample_filter) + ( + sr, + self.hop_length, + self.downsample_factor, + early_downsample_filter, + self.earlydownsample, + ) = get_early_downsample_params( + sr, hop_length, fmax_t, Q, self.n_octaves, verbose + ) + + self.register_buffer("early_downsample_filter", early_downsample_filter) if verbose == True: - print("Early downsampling filter created, \ - time used = {:.4f} seconds".format(time() - start)) + print( + "Early downsampling filter created, \ + time used = {:.4f} seconds".format( + time() - start + ) + ) else: - self.downsample_factor = 1. + self.downsample_factor = 1.0 # Preparing CQT kernels if verbose == True: - print("Creating CQT kernels ...", end='\r') + print("Creating CQT kernels ...", end="\r") start = time() # print("Q = {}, fmin_t = {}, n_filters = {}".format(Q, self.fmin_t, n_filters)) - basis, self.n_fft, _, _ = create_cqt_kernels(Q, - sr, - self.fmin_t, - n_filters, - bins_per_octave, - norm=basis_norm, - topbin_check=False) + basis, self.n_fft, _, _ = create_cqt_kernels( + Q, + sr, + self.fmin_t, + n_filters, + bins_per_octave, + norm=basis_norm, + topbin_check=False, + ) # This is for the normalization in the end freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave)) @@ -352,28 +409,33 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, b lenghts = np.ceil(Q * sr / freqs) lenghts = torch.tensor(lenghts).float() - self.register_buffer('lenghts', lenghts) + self.register_buffer("lenghts", lenghts) self.basis = basis - fft_basis = fft(basis)[:, :self.n_fft // 2 + 1] # Convert CQT kenral from time domain to freq domain + fft_basis = fft(basis)[ + :, : self.n_fft // 2 + 1 + ] # Convert CQT kenral from time domain to freq domain # These cqt_kernel is already in the frequency domain cqt_kernels_real = torch.tensor(fft_basis.real) cqt_kernels_imag = torch.tensor(fft_basis.imag) if verbose == True: - print("CQT kernels created, time used = {:.4f} seconds".format(time() - start)) + print( + "CQT kernels created, time used = {:.4f} seconds".format(time() - start) + ) # print("Getting cqt kernel done, n_fft = ",self.n_fft) # Preparing kernels for Short-Time Fourier Transform (STFT) # We set the frequency range in the CQT filter instead of here. if verbose == True: - print("Creating STFT kernels ...", end='\r') + print("Creating STFT kernels ...", end="\r") start = time() - kernel_sin, kernel_cos, self.bins2freq, _, window = create_fourier_kernels(self.n_fft, window='ones', - freq_scale='no') + kernel_sin, kernel_cos, self.bins2freq, _, window = create_fourier_kernels( + self.n_fft, window="ones", freq_scale="no" + ) wsin = kernel_sin * window wcos = kernel_cos * window @@ -381,34 +443,42 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, b wcos = torch.tensor(wcos) if verbose == True: - print("STFT kernels created, time used = {:.4f} seconds".format(time() - start)) + print( + "STFT kernels created, time used = {:.4f} seconds".format( + time() - start + ) + ) if trainable_STFT: wsin = nn.Parameter(wsin, requires_grad=trainable_STFT) wcos = nn.Parameter(wcos, requires_grad=trainable_STFT) - self.register_parameter('wsin', wsin) - self.register_parameter('wcos', wcos) + self.register_parameter("wsin", wsin) + self.register_parameter("wcos", wcos) else: - self.register_buffer('wsin', wsin) - self.register_buffer('wcos', wcos) + self.register_buffer("wsin", wsin) + self.register_buffer("wcos", wcos) if trainable_CQT: - cqt_kernels_real = nn.Parameter(cqt_kernels_real, requires_grad=trainable_CQT) - cqt_kernels_imag = nn.Parameter(cqt_kernels_imag, requires_grad=trainable_CQT) - self.register_parameter('cqt_kernels_real', cqt_kernels_real) - self.register_parameter('cqt_kernels_imag', cqt_kernels_imag) + cqt_kernels_real = nn.Parameter( + cqt_kernels_real, requires_grad=trainable_CQT + ) + cqt_kernels_imag = nn.Parameter( + cqt_kernels_imag, requires_grad=trainable_CQT + ) + self.register_parameter("cqt_kernels_real", cqt_kernels_real) + self.register_parameter("cqt_kernels_imag", cqt_kernels_imag) else: - self.register_buffer('cqt_kernels_real', cqt_kernels_real) - self.register_buffer('cqt_kernels_imag', cqt_kernels_imag) + self.register_buffer("cqt_kernels_real", cqt_kernels_real) + self.register_buffer("cqt_kernels_imag", cqt_kernels_imag) # If center==True, the STFT window will be put in the middle, and paddings at the beginning # and ending are required. - if self.pad_mode == 'constant': + if self.pad_mode == "constant": self.padding = nn.ConstantPad1d(self.n_fft // 2, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": self.padding = nn.ReflectionPad1d(self.n_fft // 2) - def forward(self, x, output_format=None, normalization_type='librosa'): + def forward(self, x, output_format=None, normalization_type="librosa"): """ Convert a batch of waveforms to CQT spectrograms. @@ -425,46 +495,65 @@ def forward(self, x, output_format=None, normalization_type='librosa'): x = broadcast_dim(x) if self.earlydownsample == True: - x = downsampling_by_n(x, self.early_downsample_filter, self.downsample_factor) + x = downsampling_by_n( + x, self.early_downsample_filter, self.downsample_factor + ) hop = self.hop_length - CQT = get_cqt_complex2(x, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding, - wcos=self.wcos, wsin=self.wsin) + CQT = get_cqt_complex2( + x, + self.cqt_kernels_real, + self.cqt_kernels_imag, + hop, + self.padding, + wcos=self.wcos, + wsin=self.wsin, + ) x_down = x # Preparing a new variable for downsampling for i in range(self.n_octaves - 1): hop = hop // 2 x_down = downsampling_by_2(x_down, self.lowpass_filter) - CQT1 = get_cqt_complex2(x_down, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding, - wcos=self.wcos, wsin=self.wsin) + CQT1 = get_cqt_complex2( + x_down, + self.cqt_kernels_real, + self.cqt_kernels_imag, + hop, + self.padding, + wcos=self.wcos, + wsin=self.wsin, + ) CQT = torch.cat((CQT1, CQT), 1) - CQT = CQT[:, -self.n_bins:, :] # Removing unwanted top bins + CQT = CQT[:, -self.n_bins :, :] # Removing unwanted top bins - if normalization_type == 'librosa': + if normalization_type == "librosa": CQT *= torch.sqrt(self.lenghts.view(-1, 1, 1)) / self.n_fft - elif normalization_type == 'convolutional': + elif normalization_type == "convolutional": pass - elif normalization_type == 'wrap': + elif normalization_type == "wrap": CQT *= 2 / self.n_fft else: - raise ValueError("The normalization_type %r is not part of our current options." % normalization_type) + raise ValueError( + "The normalization_type %r is not part of our current options." + % normalization_type + ) - if output_format == 'Magnitude': + if output_format == "Magnitude": # Getting CQT Amplitude return torch.sqrt(CQT.pow(2).sum(-1)) - elif output_format == 'Complex': + elif output_format == "Complex": return CQT - elif output_format == 'Phase': + elif output_format == "Phase": phase_real = torch.cos(torch.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0])) phase_imag = torch.sin(torch.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0])) return torch.stack((phase_real, phase_imag), -1) def extra_repr(self) -> str: - return 'STFT kernel size = {}, CQT kernel size = {}'.format( + return "STFT kernel size = {}, CQT kernel size = {}".format( (*self.wcos.shape,), (*self.cqt_kernels_real.shape,) ) @@ -563,9 +652,23 @@ class CQT1992v2(nn.Module): >>> specs = spec_layer(x) """ - def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, - bins_per_octave=12, filter_scale=1, norm=1, window='hann', center=True, pad_mode='reflect', - trainable=False, output_format='Magnitude', verbose=True): + def __init__( + self, + sr=22050, + hop_length=512, + fmin=32.70, + fmax=None, + n_bins=84, + bins_per_octave=12, + filter_scale=1, + norm=1, + window="hann", + center=True, + pad_mode="reflect", + trainable=False, + output_format="Magnitude", + verbose=True, + ): super().__init__() @@ -579,19 +682,14 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, Q = float(filter_scale) / (2 ** (1 / bins_per_octave) - 1) if verbose == True: - print("Creating CQT kernels ...", end='\r') + print("Creating CQT kernels ...", end="\r") start = time() - cqt_kernels, self.kernel_width, lenghts, freqs = create_cqt_kernels(Q, - sr, - fmin, - n_bins, - bins_per_octave, - norm, - window, - fmax) - - self.register_buffer('lenghts', lenghts) + cqt_kernels, self.kernel_width, lenghts, freqs = create_cqt_kernels( + Q, sr, fmin, n_bins, bins_per_octave, norm, window, fmax + ) + + self.register_buffer("lenghts", lenghts) self.frequencies = freqs cqt_kernels_real = torch.tensor(cqt_kernels.real).unsqueeze(1) @@ -600,16 +698,18 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, if trainable: cqt_kernels_real = nn.Parameter(cqt_kernels_real, requires_grad=trainable) cqt_kernels_imag = nn.Parameter(cqt_kernels_imag, requires_grad=trainable) - self.register_parameter('cqt_kernels_real', cqt_kernels_real) - self.register_parameter('cqt_kernels_imag', cqt_kernels_imag) + self.register_parameter("cqt_kernels_real", cqt_kernels_real) + self.register_parameter("cqt_kernels_imag", cqt_kernels_imag) else: - self.register_buffer('cqt_kernels_real', cqt_kernels_real) - self.register_buffer('cqt_kernels_imag', cqt_kernels_imag) + self.register_buffer("cqt_kernels_real", cqt_kernels_real) + self.register_buffer("cqt_kernels_imag", cqt_kernels_imag) if verbose == True: - print("CQT kernels created, time used = {:.4f} seconds".format(time() - start)) + print( + "CQT kernels created, time used = {:.4f} seconds".format(time() - start) + ) - def forward(self, x, output_format=None, normalization_type='librosa'): + def forward(self, x, output_format=None, normalization_type="librosa"): """ Convert a batch of waveforms to CQT spectrograms. @@ -638,9 +738,9 @@ def forward(self, x, output_format=None, normalization_type='librosa'): x = broadcast_dim(x) if self.center: - if self.pad_mode == 'constant': + if self.pad_mode == "constant": padding = nn.ConstantPad1d(self.kernel_width // 2, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": padding = nn.ReflectionPad1d(self.kernel_width // 2) x = padding(x) @@ -649,18 +749,21 @@ def forward(self, x, output_format=None, normalization_type='librosa'): CQT_real = conv1d(x, self.cqt_kernels_real, stride=self.hop_length) CQT_imag = -conv1d(x, self.cqt_kernels_imag, stride=self.hop_length) - if normalization_type == 'librosa': + if normalization_type == "librosa": CQT_real *= torch.sqrt(self.lenghts.view(-1, 1)) CQT_imag *= torch.sqrt(self.lenghts.view(-1, 1)) - elif normalization_type == 'convolutional': + elif normalization_type == "convolutional": pass - elif normalization_type == 'wrap': + elif normalization_type == "wrap": CQT_real *= 2 CQT_imag *= 2 else: - raise ValueError("The normalization_type %r is not part of our current options." % normalization_type) + raise ValueError( + "The normalization_type %r is not part of our current options." + % normalization_type + ) - if output_format == 'Magnitude': + if output_format == "Magnitude": if self.trainable == False: # Getting CQT Amplitude CQT = torch.sqrt(CQT_real.pow(2) + CQT_imag.pow(2)) @@ -668,10 +771,10 @@ def forward(self, x, output_format=None, normalization_type='librosa'): CQT = torch.sqrt(CQT_real.pow(2) + CQT_imag.pow(2) + 1e-8) return CQT - elif output_format == 'Complex': + elif output_format == "Complex": return torch.stack((CQT_real, CQT_imag), -1) - elif output_format == 'Phase': + elif output_format == "Phase": phase_real = torch.cos(torch.atan2(CQT_imag, CQT_real)) phase_imag = torch.sin(torch.atan2(CQT_imag, CQT_real)) return torch.stack((phase_real, phase_imag), -1) @@ -683,9 +786,9 @@ def forward_manual(self, x): x = broadcast_dim(x) if self.center: - if self.pad_mode == 'constant': + if self.pad_mode == "constant": padding = nn.ConstantPad1d(self.kernel_width // 2, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": padding = nn.ReflectionPad1d(self.kernel_width // 2) x = padding(x) @@ -795,18 +898,37 @@ class CQT2010v2(nn.Module): # To DO: # need to deal with the filter and other tensors - def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, filter_scale=1, - bins_per_octave=12, norm=True, basis_norm=1, window='hann', pad_mode='reflect', - earlydownsample=True, trainable=False, output_format='Magnitude', verbose=True): + def __init__( + self, + sr=22050, + hop_length=512, + fmin=32.70, + fmax=None, + n_bins=84, + filter_scale=1, + bins_per_octave=12, + norm=True, + basis_norm=1, + window="hann", + pad_mode="reflect", + earlydownsample=True, + trainable=False, + output_format="Magnitude", + verbose=True, + ): super().__init__() - self.norm = norm # Now norm is used to normalize the final CQT result by dividing n_fft + self.norm = ( + norm # Now norm is used to normalize the final CQT result by dividing n_fft + ) # basis_norm is for normalizing basis self.hop_length = hop_length self.pad_mode = pad_mode self.n_bins = n_bins - self.earlydownsample = earlydownsample # We will activate early downsampling later if possible + self.earlydownsample = ( + earlydownsample # We will activate early downsampling later if possible + ) self.trainable = trainable self.output_format = output_format @@ -815,23 +937,27 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, f # Creating lowpass filter and make it a torch tensor if verbose == True: - print("Creating low pass filter ...", end='\r') + print("Creating low pass filter ...", end="\r") start = time() # self.lowpass_filter = torch.tensor( # create_lowpass_filter( # band_center = 0.50, # kernelLength=256, # transitionBandwidth=0.001)) - lowpass_filter = torch.tensor(create_lowpass_filter( - band_center=0.50, - kernelLength=256, - transitionBandwidth=0.001) + lowpass_filter = torch.tensor( + create_lowpass_filter( + band_center=0.50, kernelLength=256, transitionBandwidth=0.001 + ) ) # Broadcast the tensor to the shape that fits conv1d - self.register_buffer('lowpass_filter', lowpass_filter[None, None, :]) + self.register_buffer("lowpass_filter", lowpass_filter[None, None, :]) if verbose == True: - print("Low pass filter created, time used = {:.4f} seconds".format(time() - start)) + print( + "Low pass filter created, time used = {:.4f} seconds".format( + time() - start + ) + ) # Caluate num of filter requires for the kernel # n_octaves determines how many resampling requires for the CQT @@ -852,41 +978,57 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, f # Calculate the top bin frequency fmax_t = self.fmin_t * 2 ** ((remainder - 1) / bins_per_octave) - self.fmin_t = fmax_t / 2 ** (1 - 1 / bins_per_octave) # Adjusting the top minium bins + self.fmin_t = fmax_t / 2 ** ( + 1 - 1 / bins_per_octave + ) # Adjusting the top minium bins if fmax_t > sr / 2: - raise ValueError('The top bin {}Hz has exceeded the Nyquist frequency, \ - please reduce the n_bins'.format(fmax_t)) - - if self.earlydownsample == True: # Do early downsampling if this argument is True + raise ValueError( + "The top bin {}Hz has exceeded the Nyquist frequency, \ + please reduce the n_bins".format( + fmax_t + ) + ) + + if ( + self.earlydownsample == True + ): # Do early downsampling if this argument is True if verbose == True: - print("Creating early downsampling filter ...", end='\r') + print("Creating early downsampling filter ...", end="\r") start = time() - sr, self.hop_length, self.downsample_factor, early_downsample_filter, \ - self.earlydownsample = get_early_downsample_params(sr, - hop_length, - fmax_t, - Q, - self.n_octaves, - verbose) - self.register_buffer('early_downsample_filter', early_downsample_filter) + ( + sr, + self.hop_length, + self.downsample_factor, + early_downsample_filter, + self.earlydownsample, + ) = get_early_downsample_params( + sr, hop_length, fmax_t, Q, self.n_octaves, verbose + ) + self.register_buffer("early_downsample_filter", early_downsample_filter) if verbose == True: - print("Early downsampling filter created, \ - time used = {:.4f} seconds".format(time() - start)) + print( + "Early downsampling filter created, \ + time used = {:.4f} seconds".format( + time() - start + ) + ) else: - self.downsample_factor = 1. + self.downsample_factor = 1.0 # Preparing CQT kernels if verbose == True: - print("Creating CQT kernels ...", end='\r') + print("Creating CQT kernels ...", end="\r") start = time() - basis, self.n_fft, lenghts, _ = create_cqt_kernels(Q, - sr, - self.fmin_t, - n_filters, - bins_per_octave, - norm=basis_norm, - topbin_check=False) + basis, self.n_fft, lenghts, _ = create_cqt_kernels( + Q, + sr, + self.fmin_t, + n_filters, + bins_per_octave, + norm=basis_norm, + topbin_check=False, + ) # For normalization in the end # The freqs returned by create_cqt_kernels cannot be used # Since that returns only the top octave bins @@ -896,7 +1038,7 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, f lenghts = np.ceil(Q * sr / freqs) lenghts = torch.tensor(lenghts).float() - self.register_buffer('lenghts', lenghts) + self.register_buffer("lenghts", lenghts) self.basis = basis # These cqt_kernel is already in the frequency domain @@ -906,24 +1048,26 @@ def __init__(self, sr=22050, hop_length=512, fmin=32.70, fmax=None, n_bins=84, f if trainable: cqt_kernels_real = nn.Parameter(cqt_kernels_real, requires_grad=trainable) cqt_kernels_imag = nn.Parameter(cqt_kernels_imag, requires_grad=trainable) - self.register_parameter('cqt_kernels_real', cqt_kernels_real) - self.register_parameter('cqt_kernels_imag', cqt_kernels_imag) + self.register_parameter("cqt_kernels_real", cqt_kernels_real) + self.register_parameter("cqt_kernels_imag", cqt_kernels_imag) else: - self.register_buffer('cqt_kernels_real', cqt_kernels_real) - self.register_buffer('cqt_kernels_imag', cqt_kernels_imag) + self.register_buffer("cqt_kernels_real", cqt_kernels_real) + self.register_buffer("cqt_kernels_imag", cqt_kernels_imag) if verbose == True: - print("CQT kernels created, time used = {:.4f} seconds".format(time() - start)) + print( + "CQT kernels created, time used = {:.4f} seconds".format(time() - start) + ) # print("Getting cqt kernel done, n_fft = ",self.n_fft) # If center==True, the STFT window will be put in the middle, and paddings at the beginning # and ending are required. - if self.pad_mode == 'constant': + if self.pad_mode == "constant": self.padding = nn.ConstantPad1d(self.n_fft // 2, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": self.padding = nn.ReflectionPad1d(self.n_fft // 2) - def forward(self, x, output_format=None, normalization_type='librosa'): + def forward(self, x, output_format=None, normalization_type="librosa"): """ Convert a batch of waveforms to CQT spectrograms. @@ -940,20 +1084,25 @@ def forward(self, x, output_format=None, normalization_type='librosa'): x = broadcast_dim(x) if self.earlydownsample == True: - x = downsampling_by_n(x, self.early_downsample_filter, self.downsample_factor) + x = downsampling_by_n( + x, self.early_downsample_filter, self.downsample_factor + ) hop = self.hop_length - CQT = get_cqt_complex(x, self.cqt_kernels_real, self.cqt_kernels_imag, hop, - self.padding) # Getting the top octave CQT + CQT = get_cqt_complex( + x, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding + ) # Getting the top octave CQT x_down = x # Preparing a new variable for downsampling for i in range(self.n_octaves - 1): hop = hop // 2 x_down = downsampling_by_2(x_down, self.lowpass_filter) - CQT1 = get_cqt_complex(x_down, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding) + CQT1 = get_cqt_complex( + x_down, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding + ) CQT = torch.cat((CQT1, CQT), 1) - CQT = CQT[:, -self.n_bins:, :] # Removing unwanted bottom bins + CQT = CQT[:, -self.n_bins :, :] # Removing unwanted bottom bins # print("downsample_factor = ",self.downsample_factor) # print(CQT.shape) # print(self.lenghts.view(-1,1).shape) @@ -962,26 +1111,29 @@ def forward(self, x, output_format=None, normalization_type='librosa'): # same mag as 1992 CQT = CQT * self.downsample_factor # Normalize again to get same result as librosa - if normalization_type == 'librosa': + if normalization_type == "librosa": CQT = CQT * torch.sqrt(self.lenghts.view(-1, 1, 1)) - elif normalization_type == 'convolutional': + elif normalization_type == "convolutional": pass - elif normalization_type == 'wrap': + elif normalization_type == "wrap": CQT *= 2 else: - raise ValueError("The normalization_type %r is not part of our current options." % normalization_type) + raise ValueError( + "The normalization_type %r is not part of our current options." + % normalization_type + ) - if output_format == 'Magnitude': + if output_format == "Magnitude": if self.trainable == False: # Getting CQT Amplitude return torch.sqrt(CQT.pow(2).sum(-1)) else: return torch.sqrt(CQT.pow(2).sum(-1) + 1e-8) - elif output_format == 'Complex': + elif output_format == "Complex": return CQT - elif output_format == 'Phase': + elif output_format == "Phase": phase_real = torch.cos(torch.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0])) phase_imag = torch.sin(torch.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0])) return torch.stack((phase_real, phase_imag), -1) @@ -989,4 +1141,5 @@ def forward(self, x, output_format=None, normalization_type='librosa'): class CQT(CQT1992v2): """An abbreviation for :func:`~nnAudio.Spectrogram.CQT1992v2`. Please refer to the :func:`~nnAudio.Spectrogram.CQT1992v2` documentation""" - pass \ No newline at end of file + + pass diff --git a/Installation/nnAudio/features/gammatone.py b/Installation/nnAudio/features/gammatone.py index 74d171b..4069a43 100644 --- a/Installation/nnAudio/features/gammatone.py +++ b/Installation/nnAudio/features/gammatone.py @@ -52,9 +52,24 @@ class Gammatonegram(nn.Module): >>> specs = spec_layer(x) """ - def __init__(self, sr=44100, n_fft=2048, n_bins=64, hop_length=512, window='hann', center=True, pad_mode='reflect', - power=2.0, htk=False, fmin=20.0, fmax=None, norm=1, trainable_bins=False, trainable_STFT=False, - verbose=True): + def __init__( + self, + sr=44100, + n_fft=2048, + n_bins=64, + hop_length=512, + window="hann", + center=True, + pad_mode="reflect", + power=2.0, + htk=False, + fmin=20.0, + fmax=None, + norm=1, + trainable_bins=False, + trainable_STFT=False, + verbose=True, + ): super(Gammatonegram, self).__init__() self.stride = hop_length self.center = center @@ -64,8 +79,9 @@ def __init__(self, sr=44100, n_fft=2048, n_bins=64, hop_length=512, window='hann # Create filter windows for stft start = time() - wsin, wcos, self.bins2freq, _, _ = create_fourier_kernels(n_fft, freq_bins=None, window=window, freq_scale='no', - sr=sr) + wsin, wcos, self.bins2freq, _, _ = create_fourier_kernels( + n_fft, freq_bins=None, window=window, freq_scale="no", sr=sr + ) wsin = torch.tensor(wsin, dtype=torch.float) wcos = torch.tensor(wcos, dtype=torch.float) @@ -73,11 +89,11 @@ def __init__(self, sr=44100, n_fft=2048, n_bins=64, hop_length=512, window='hann if trainable_STFT: wsin = nn.Parameter(wsin, requires_grad=trainable_STFT) wcos = nn.Parameter(wcos, requires_grad=trainable_STFT) - self.register_parameter('wsin', wsin) - self.register_parameter('wcos', wcos) + self.register_parameter("wsin", wsin) + self.register_parameter("wcos", wcos) else: - self.register_buffer('wsin', wsin) - self.register_buffer('wcos', wcos) + self.register_buffer("wsin", wsin) + self.register_buffer("wcos", wcos) # Creating kenral for Gammatone spectrogram start = time() @@ -85,17 +101,25 @@ def __init__(self, sr=44100, n_fft=2048, n_bins=64, hop_length=512, window='hann gammatone_basis = torch.tensor(gammatone_basis) if verbose == True: - print("STFT filter created, time used = {:.4f} seconds".format(time() - start)) - print("Gammatone filter created, time used = {:.4f} seconds".format(time() - start)) + print( + "STFT filter created, time used = {:.4f} seconds".format(time() - start) + ) + print( + "Gammatone filter created, time used = {:.4f} seconds".format( + time() - start + ) + ) else: pass # Making everything nn.Prarmeter, so that this model can support nn.DataParallel if trainable_bins: - gammatone_basis = nn.Parameter(gammatone_basis, requires_grad=trainable_bins) - self.register_parameter('gammatone_basis', gammatone_basis) + gammatone_basis = nn.Parameter( + gammatone_basis, requires_grad=trainable_bins + ) + self.register_parameter("gammatone_basis", gammatone_basis) else: - self.register_buffer('gammatone_basis', gammatone_basis) + self.register_buffer("gammatone_basis", gammatone_basis) # if trainable_mel==True: # self.mel_basis = nn.Parameter(self.mel_basis) @@ -106,15 +130,20 @@ def __init__(self, sr=44100, n_fft=2048, n_bins=64, hop_length=512, window='hann def forward(self, x): x = broadcast_dim(x) if self.center: - if self.pad_mode == 'constant': + if self.pad_mode == "constant": padding = nn.ConstantPad1d(self.n_fft // 2, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": padding = nn.ReflectionPad1d(self.n_fft // 2) x = padding(x) - spec = torch.sqrt(conv1d(x, self.wsin, stride=self.stride).pow(2) \ - + conv1d(x, self.wcos, stride=self.stride).pow(2)) ** self.power # Doing STFT by using conv1d + spec = ( + torch.sqrt( + conv1d(x, self.wsin, stride=self.stride).pow(2) + + conv1d(x, self.wcos, stride=self.stride).pow(2) + ) + ** self.power + ) # Doing STFT by using conv1d gammatonespec = torch.matmul(self.gammatone_basis, spec) - return gammatonespec \ No newline at end of file + return gammatonespec diff --git a/Installation/nnAudio/features/griffin_lim.py b/Installation/nnAudio/features/griffin_lim.py index 61cf2c8..86bb5db 100644 --- a/Installation/nnAudio/features/griffin_lim.py +++ b/Installation/nnAudio/features/griffin_lim.py @@ -44,16 +44,18 @@ class Griffin_Lim(nn.Module): """ - def __init__(self, - n_fft, - n_iter=32, - hop_length=None, - win_length=None, - window='hann', - center=True, - pad_mode='reflect', - momentum=0.99, - device='cpu'): + def __init__( + self, + n_fft, + n_iter=32, + hop_length=None, + win_length=None, + window="hann", + center=True, + pad_mode="reflect", + momentum=0.99, + device="cpu", + ): super().__init__() self.n_fft = n_fft @@ -73,10 +75,9 @@ def __init__(self, self.hop_length = hop_length # Creating window function for stft and istft later - self.w = torch.tensor(get_window(window, - int(self.win_length), - fftbins=True), - device=device).float() + self.w = torch.tensor( + get_window(window, int(self.win_length), fftbins=True), device=device + ).float() def forward(self, S): """ @@ -88,7 +89,9 @@ def forward(self, S): Spectrogram of the shape ``(batch, n_fft//2+1, timesteps)`` """ - assert S.dim() == 3, "Please make sure your input is in the shape of (batch, freq_bins, timesteps)" + assert ( + S.dim() == 3 + ), "Please make sure your input is in the shape of (batch, freq_bins, timesteps)" # Initializing Random Phase rand_phase = torch.randn(*S.shape, device=self.device) @@ -104,31 +107,42 @@ def forward(self, S): # spec2wav conversion # print(f'win_length={self.win_length}\tw={self.w.shape}') - inverse = torch.istft(S.unsqueeze(-1) * angles, - self.n_fft, - self.hop_length, - win_length=self.win_length, - window=self.w, - center=self.center) + inverse = torch.istft( + S.unsqueeze(-1) * angles, + self.n_fft, + self.hop_length, + win_length=self.win_length, + window=self.w, + center=self.center, + ) # wav2spec conversion - rebuilt = torch.stft(inverse, - self.n_fft, - self.hop_length, - win_length=self.win_length, - window=self.w, - pad_mode=self.pad_mode) + rebuilt = torch.stft( + inverse, + self.n_fft, + self.hop_length, + win_length=self.win_length, + window=self.w, + pad_mode=self.pad_mode, + ) # Phase update rule - angles[:, :, :] = rebuilt[:, :, :] - (self.momentum / (1 + self.momentum)) * tprev[:, :, :] + angles[:, :, :] = ( + rebuilt[:, :, :] + - (self.momentum / (1 + self.momentum)) * tprev[:, :, :] + ) # Phase normalization - angles = angles.div(torch.sqrt(angles.pow(2).sum(-1)).unsqueeze(-1) + 1e-16) # normalizing the phase + angles = angles.div( + torch.sqrt(angles.pow(2).sum(-1)).unsqueeze(-1) + 1e-16 + ) # normalizing the phase # Using the final phase to reconstruct the waveforms - inverse = torch.istft(S.unsqueeze(-1) * angles, - self.n_fft, - self.hop_length, - win_length=self.win_length, - window=self.w, - center=self.center) - return inverse \ No newline at end of file + inverse = torch.istft( + S.unsqueeze(-1) * angles, + self.n_fft, + self.hop_length, + win_length=self.win_length, + window=self.w, + center=self.center, + ) + return inverse diff --git a/Installation/nnAudio/features/mel.py b/Installation/nnAudio/features/mel.py index c26d22a..43a59f0 100644 --- a/Installation/nnAudio/features/mel.py +++ b/Installation/nnAudio/features/mel.py @@ -90,10 +90,26 @@ class MelSpectrogram(nn.Module): >>> specs = spec_layer(x) """ - def __init__(self, sr=22050, n_fft=2048, win_length=None, n_mels=128, hop_length=512, - window='hann', center=True, pad_mode='reflect', power=2.0, htk=False, - fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False, - verbose=True, **kwargs): + def __init__( + self, + sr=22050, + n_fft=2048, + win_length=None, + n_mels=128, + hop_length=512, + window="hann", + center=True, + pad_mode="reflect", + power=2.0, + htk=False, + fmin=0.0, + fmax=None, + norm=1, + trainable_mel=False, + trainable_STFT=False, + verbose=True, + **kwargs + ): super().__init__() self.stride = hop_length @@ -105,10 +121,21 @@ def __init__(self, sr=22050, n_fft=2048, win_length=None, n_mels=128, hop_length self.trainable_STFT = trainable_STFT # Preparing for the stft layer. No need for center - self.stft = STFT(n_fft=n_fft, win_length=win_length, freq_bins=None, - hop_length=hop_length, window=window, freq_scale='no', - center=center, pad_mode=pad_mode, sr=sr, trainable=trainable_STFT, - output_format="Magnitude", verbose=verbose, **kwargs) + self.stft = STFT( + n_fft=n_fft, + win_length=win_length, + freq_bins=None, + hop_length=hop_length, + window=window, + freq_scale="no", + center=center, + pad_mode=pad_mode, + sr=sr, + trainable=trainable_STFT, + output_format="Magnitude", + verbose=verbose, + **kwargs + ) # Create filter windows for stft start = time() @@ -119,17 +146,21 @@ def __init__(self, sr=22050, n_fft=2048, win_length=None, n_mels=128, hop_length mel_basis = torch.tensor(mel_basis) if verbose == True: - print("STFT filter created, time used = {:.4f} seconds".format(time() - start)) - print("Mel filter created, time used = {:.4f} seconds".format(time() - start)) + print( + "STFT filter created, time used = {:.4f} seconds".format(time() - start) + ) + print( + "Mel filter created, time used = {:.4f} seconds".format(time() - start) + ) else: pass if trainable_mel: # Making everything nn.Parameter, so that this model can support nn.DataParallel mel_basis = nn.Parameter(mel_basis, requires_grad=trainable_mel) - self.register_parameter('mel_basis', mel_basis) + self.register_parameter("mel_basis", mel_basis) else: - self.register_buffer('mel_basis', mel_basis) + self.register_buffer("mel_basis", mel_basis) # if trainable_mel==True: # self.mel_basis = nn.Parameter(self.mel_basis) @@ -152,13 +183,13 @@ def forward(self, x): """ x = broadcast_dim(x) - spec = self.stft(x, output_format='Magnitude') ** self.power + spec = self.stft(x, output_format="Magnitude") ** self.power melspec = torch.matmul(self.mel_basis, spec) return melspec def extra_repr(self) -> str: - return 'Mel filter banks size = {}, trainable_mel={}'.format( + return "Mel filter banks size = {}, trainable_mel={}".format( (*self.mel_basis.shape,), self.trainable_mel, self.trainable_STFT ) @@ -204,32 +235,42 @@ class MFCC(nn.Module): >>> mfcc = spec_layer(x) """ - def __init__(self, sr=22050, n_mfcc=20, norm='ortho', verbose=True, ref=1.0, amin=1e-10, top_db=80.0, **kwargs): + def __init__( + self, + sr=22050, + n_mfcc=20, + norm="ortho", + verbose=True, + ref=1.0, + amin=1e-10, + top_db=80.0, + **kwargs + ): super().__init__() self.melspec_layer = MelSpectrogram(sr=sr, verbose=verbose, **kwargs) self.m_mfcc = n_mfcc # attributes that will be used for _power_to_db if amin <= 0: - raise ParameterError('amin must be strictly positive') + raise ParameterError("amin must be strictly positive") amin = torch.tensor([amin]) ref = torch.abs(torch.tensor([ref])) - self.register_buffer('amin', amin) - self.register_buffer('ref', ref) + self.register_buffer("amin", amin) + self.register_buffer("ref", ref) self.top_db = top_db self.n_mfcc = n_mfcc def _power_to_db(self, S): - ''' + """ Refer to https://librosa.github.io/librosa/_modules/librosa/core/spectrum.html#power_to_db for the original implmentation. - ''' + """ log_spec = 10.0 * torch.log10(torch.max(S, self.amin)) log_spec -= 10.0 * torch.log10(torch.max(self.amin, self.ref)) if self.top_db is not None: if self.top_db < 0: - raise ParameterError('top_db must be non-negative') + raise ParameterError("top_db must be non-negative") # make the dim same as log_spec so that it can be broadcasted batch_wise_max = log_spec.flatten(1).max(1)[0].unsqueeze(1).unsqueeze(1) @@ -238,10 +279,12 @@ def _power_to_db(self, S): return log_spec def _dct(self, x, norm=None): - ''' + """ Refer to https://github.com/zh217/torch-dct for the original implmentation. - ''' - x = x.permute(0, 2, 1) # make freq the last axis, since dct applies to the frequency axis + """ + x = x.permute( + 0, 2, 1 + ) # make freq the last axis, since dct applies to the frequency axis x_shape = x.shape N = x_shape[-1] @@ -249,13 +292,13 @@ def _dct(self, x, norm=None): Vc = rfft_fn(v, 1, onesided=False) # TODO: Can make the W_r and W_i trainable here - k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N) + k = -torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N) W_r = torch.cos(k) W_i = torch.sin(k) V = Vc[:, :, :, 0] * W_r - Vc[:, :, :, 1] * W_i - if norm == 'ortho': + if norm == "ortho": V[:, :, 0] /= np.sqrt(N) * 2 V[:, :, 1:] /= np.sqrt(N / 2) * 2 @@ -279,10 +322,8 @@ def forward(self, x): x = self.melspec_layer(x) x = self._power_to_db(x) - x = self._dct(x, norm='ortho')[:, :self.m_mfcc, :] + x = self._dct(x, norm="ortho")[:, : self.m_mfcc, :] return x def extra_repr(self) -> str: - return 'n_mfcc = {}'.format( - (self.n_mfcc) - ) \ No newline at end of file + return "n_mfcc = {}".format((self.n_mfcc)) diff --git a/Installation/nnAudio/features/stft.py b/Installation/nnAudio/features/stft.py index 5d82cc8..c14c010 100644 --- a/Installation/nnAudio/features/stft.py +++ b/Installation/nnAudio/features/stft.py @@ -12,7 +12,9 @@ class STFTBase(nn.Module): STFT and iSTFT share the same `inverse_stft` function """ - def inverse_stft(self, X, kernel_cos, kernel_sin, onesided=True, length=None, refresh_win=True): + def inverse_stft( + self, X, kernel_cos, kernel_sin, onesided=True, length=None, refresh_win=True + ): # If the input spectrogram contains only half of the n_fft # Use extend_fbins function to get back another half if onesided: @@ -29,7 +31,7 @@ def inverse_stft(self, X, kernel_cos, kernel_sin, onesided=True, length=None, re real = real.squeeze(-2) * self.window_mask # Normalize the amplitude with n_fft - real /= (self.n_fft) + real /= self.n_fft # Overlap and Add algorithm to connect all the frames real = overlap_add(real, self.stride) @@ -37,21 +39,24 @@ def inverse_stft(self, X, kernel_cos, kernel_sin, onesided=True, length=None, re # Prepare the window sumsqure for division # Only need to create this window once to save time # Unless the input spectrograms have different time steps - if hasattr(self, 'w_sum') == False or refresh_win == True: - self.w_sum = torch_window_sumsquare(self.window_mask.flatten(), X.shape[2], self.stride, - self.n_fft).flatten() - self.nonzero_indices = (self.w_sum > 1e-10) + if hasattr(self, "w_sum") == False or refresh_win == True: + self.w_sum = torch_window_sumsquare( + self.window_mask.flatten(), X.shape[2], self.stride, self.n_fft + ).flatten() + self.nonzero_indices = self.w_sum > 1e-10 else: pass - real[:, self.nonzero_indices] = real[:, self.nonzero_indices].div(self.w_sum[self.nonzero_indices]) + real[:, self.nonzero_indices] = real[:, self.nonzero_indices].div( + self.w_sum[self.nonzero_indices] + ) # Remove padding if length is None: if self.center: - real = real[:, self.pad_amount:-self.pad_amount] + real = real[:, self.pad_amount : -self.pad_amount] else: if self.center: - real = real[:, self.pad_amount:self.pad_amount + length] + real = real[:, self.pad_amount : self.pad_amount + length] else: real = real[:, :length] @@ -145,16 +150,32 @@ class STFT(STFTBase): >>> specs = spec_layer(x) """ - def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, window='hann', - freq_scale='no', center=True, pad_mode='reflect', iSTFT=False, - fmin=50, fmax=6000, sr=22050, trainable=False, - output_format="Complex", verbose=True): + def __init__( + self, + n_fft=2048, + win_length=None, + freq_bins=None, + hop_length=None, + window="hann", + freq_scale="no", + center=True, + pad_mode="reflect", + iSTFT=False, + fmin=50, + fmax=6000, + sr=22050, + trainable=False, + output_format="Complex", + verbose=True, + ): super().__init__() # Trying to make the default setting same as librosa - if win_length == None: win_length = n_fft - if hop_length == None: hop_length = int(win_length // 4) + if win_length == None: + win_length = n_fft + if hop_length == None: + hop_length = int(win_length // 4) self.output_format = output_format self.trainable = trainable @@ -172,15 +193,23 @@ def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, start = time() # Create filter windows for stft - kernel_sin, kernel_cos, self.bins2freq, self.bin_list, window_mask = create_fourier_kernels(n_fft, - win_length=win_length, - freq_bins=freq_bins, - window=window, - freq_scale=freq_scale, - fmin=fmin, - fmax=fmax, - sr=sr, - verbose=verbose) + ( + kernel_sin, + kernel_cos, + self.bins2freq, + self.bin_list, + window_mask, + ) = create_fourier_kernels( + n_fft, + win_length=win_length, + freq_bins=freq_bins, + window=window, + freq_scale=freq_scale, + fmin=fmin, + fmax=fmax, + sr=sr, + verbose=verbose, + ) kernel_sin = torch.tensor(kernel_sin, dtype=torch.float) kernel_cos = torch.tensor(kernel_cos, dtype=torch.float) @@ -190,8 +219,8 @@ def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, kernel_cos_inv = torch.cat((kernel_cos, kernel_cos[1:-1].flip(0)), 0) if iSTFT: - self.register_buffer('kernel_sin_inv', kernel_sin_inv.unsqueeze(-1)) - self.register_buffer('kernel_cos_inv', kernel_cos_inv.unsqueeze(-1)) + self.register_buffer("kernel_sin_inv", kernel_sin_inv.unsqueeze(-1)) + self.register_buffer("kernel_cos_inv", kernel_cos_inv.unsqueeze(-1)) # Making all these variables nn.Parameter, so that the model can be used with nn.Parallel # self.kernel_sin = nn.Parameter(self.kernel_sin, requires_grad=self.trainable) @@ -203,20 +232,24 @@ def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, wcos = kernel_cos * window_mask if self.trainable == False: - self.register_buffer('wsin', wsin) - self.register_buffer('wcos', wcos) + self.register_buffer("wsin", wsin) + self.register_buffer("wcos", wcos) if self.trainable == True: wsin = nn.Parameter(wsin, requires_grad=self.trainable) wcos = nn.Parameter(wcos, requires_grad=self.trainable) - self.register_parameter('wsin', wsin) - self.register_parameter('wcos', wcos) + self.register_parameter("wsin", wsin) + self.register_parameter("wcos", wcos) # Prepare the shape of window mask so that it can be used later in inverse - self.register_buffer('window_mask', window_mask.unsqueeze(0).unsqueeze(-1)) + self.register_buffer("window_mask", window_mask.unsqueeze(0).unsqueeze(-1)) if verbose == True: - print("STFT kernels created, time used = {:.4f} seconds".format(time() - start)) + print( + "STFT kernels created, time used = {:.4f} seconds".format( + time() - start + ) + ) else: pass @@ -243,35 +276,44 @@ def forward(self, x, output_format=None): x = broadcast_dim(x) if self.center: - if self.pad_mode == 'constant': + if self.pad_mode == "constant": padding = nn.ConstantPad1d(self.pad_amount, 0) - elif self.pad_mode == 'reflect': + elif self.pad_mode == "reflect": if self.num_samples < self.pad_amount: - raise AssertionError("Signal length shorter than reflect padding length (n_fft // 2).") + raise AssertionError( + "Signal length shorter than reflect padding length (n_fft // 2)." + ) padding = nn.ReflectionPad1d(self.pad_amount) x = padding(x) spec_imag = conv1d(x, self.wsin, stride=self.stride) - spec_real = conv1d(x, self.wcos, stride=self.stride) # Doing STFT by using conv1d + spec_real = conv1d( + x, self.wcos, stride=self.stride + ) # Doing STFT by using conv1d # remove redundant parts - spec_real = spec_real[:, :self.freq_bins, :] - spec_imag = spec_imag[:, :self.freq_bins, :] + spec_real = spec_real[:, : self.freq_bins, :] + spec_imag = spec_imag[:, : self.freq_bins, :] - if output_format == 'Magnitude': + if output_format == "Magnitude": spec = spec_real.pow(2) + spec_imag.pow(2) if self.trainable == True: - return torch.sqrt(spec + 1e-8) # prevent Nan gradient when sqrt(0) due to output=0 + return torch.sqrt( + spec + 1e-8 + ) # prevent Nan gradient when sqrt(0) due to output=0 else: return torch.sqrt(spec) - elif output_format == 'Complex': - return torch.stack((spec_real, -spec_imag), -1) # Remember the minus sign for imaginary part + elif output_format == "Complex": + return torch.stack( + (spec_real, -spec_imag), -1 + ) # Remember the minus sign for imaginary part - elif output_format == 'Phase': - return torch.atan2(-spec_imag + 0.0, - spec_real) # +0.0 removes -0.0 elements, which leads to error in calculating phase + elif output_format == "Phase": + return torch.atan2( + -spec_imag + 0.0, spec_real + ) # +0.0 removes -0.0 elements, which leads to error in calculating phase def inverse(self, X, onesided=True, length=None, refresh_win=True): """ @@ -297,16 +339,24 @@ def inverse(self, X, onesided=True, length=None, refresh_win=True): """ - if (hasattr(self, 'kernel_sin_inv') != True) or (hasattr(self, 'kernel_cos_inv') != True): - raise NameError("Please activate the iSTFT module by setting `iSTFT=True` if you want to use `inverse`") - - assert X.dim() == 4, "Inverse iSTFT only works for complex number," \ - "make sure our tensor is in the shape of (batch, freq_bins, timesteps, 2)." \ - "\nIf you have a magnitude spectrogram, please consider using Griffin-Lim." - return self.inverse_stft(X, self.kernel_cos_inv, self.kernel_sin_inv, onesided, length, refresh_win) + if (hasattr(self, "kernel_sin_inv") != True) or ( + hasattr(self, "kernel_cos_inv") != True + ): + raise NameError( + "Please activate the iSTFT module by setting `iSTFT=True` if you want to use `inverse`" + ) + + assert X.dim() == 4, ( + "Inverse iSTFT only works for complex number," + "make sure our tensor is in the shape of (batch, freq_bins, timesteps, 2)." + "\nIf you have a magnitude spectrogram, please consider using Griffin-Lim." + ) + return self.inverse_stft( + X, self.kernel_cos_inv, self.kernel_sin_inv, onesided, length, refresh_win + ) def extra_repr(self) -> str: - return 'n_fft={}, Fourier Kernel size={}, iSTFT={}, trainable={}'.format( + return "n_fft={}, Fourier Kernel size={}, iSTFT={}, trainable={}".format( self.n_fft, (*self.wsin.shape,), self.iSTFT, self.trainable ) @@ -386,15 +436,31 @@ class iSTFT(STFTBase): >>> specs = spec_layer(x) """ - def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, window='hann', - freq_scale='no', center=True, fmin=50, fmax=6000, sr=22050, trainable_kernels=False, - trainable_window=False, verbose=True, refresh_win=True): + def __init__( + self, + n_fft=2048, + win_length=None, + freq_bins=None, + hop_length=None, + window="hann", + freq_scale="no", + center=True, + fmin=50, + fmax=6000, + sr=22050, + trainable_kernels=False, + trainable_window=False, + verbose=True, + refresh_win=True, + ): super().__init__() # Trying to make the default setting same as librosa - if win_length == None: win_length = n_fft - if hop_length == None: hop_length = int(win_length // 4) + if win_length == None: + win_length = n_fft + if hop_length == None: + hop_length = int(win_length // 4) self.n_fft = n_fft self.win_length = win_length @@ -409,15 +475,17 @@ def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, # Create the window function and prepare the shape for batch-wise-time-wise multiplication # Create filter windows for inverse - kernel_sin, kernel_cos, _, _, window_mask = create_fourier_kernels(n_fft, - win_length=win_length, - freq_bins=n_fft, - window=window, - freq_scale=freq_scale, - fmin=fmin, - fmax=fmax, - sr=sr, - verbose=False) + kernel_sin, kernel_cos, _, _, window_mask = create_fourier_kernels( + n_fft, + win_length=win_length, + freq_bins=n_fft, + window=window, + freq_scale=freq_scale, + fmin=fmin, + fmax=fmax, + sr=sr, + verbose=False, + ) window_mask = get_window(window, int(win_length), fftbins=True) # For inverse, the Fourier kernels do not need to be windowed @@ -432,22 +500,26 @@ def __init__(self, n_fft=2048, win_length=None, freq_bins=None, hop_length=None, # Making all these variables trainable kernel_sin = nn.Parameter(kernel_sin, requires_grad=trainable_kernels) kernel_cos = nn.Parameter(kernel_cos, requires_grad=trainable_kernels) - self.register_parameter('kernel_sin', kernel_sin) - self.register_parameter('kernel_cos', kernel_cos) + self.register_parameter("kernel_sin", kernel_sin) + self.register_parameter("kernel_cos", kernel_cos) else: - self.register_buffer('kernel_sin', kernel_sin) - self.register_buffer('kernel_cos', kernel_cos) + self.register_buffer("kernel_sin", kernel_sin) + self.register_buffer("kernel_cos", kernel_cos) # Decide if the window function is trainable if trainable_window: window_mask = nn.Parameter(window_mask, requires_grad=trainable_window) - self.register_parameter('window_mask', window_mask) + self.register_parameter("window_mask", window_mask) else: - self.register_buffer('window_mask', window_mask) + self.register_buffer("window_mask", window_mask) if verbose == True: - print("iSTFT kernels created, time used = {:.4f} seconds".format(time() - start)) + print( + "iSTFT kernels created, time used = {:.4f} seconds".format( + time() - start + ) + ) else: pass @@ -464,7 +536,11 @@ def forward(self, X, onesided=False, length=None, refresh_win=None): if refresh_win == None: refresh_win = self.refresh_win - assert X.dim() == 4, "Inverse iSTFT only works for complex number," \ - "make sure our tensor is in the shape of (batch, freq_bins, timesteps, 2)" + assert X.dim() == 4, ( + "Inverse iSTFT only works for complex number," + "make sure our tensor is in the shape of (batch, freq_bins, timesteps, 2)" + ) - return self.inverse_stft(X, self.kernel_cos, self.kernel_sin, onesided, length, refresh_win) + return self.inverse_stft( + X, self.kernel_cos, self.kernel_sin, onesided, length, refresh_win + ) diff --git a/Installation/nnAudio/librosa_functions.py b/Installation/nnAudio/librosa_functions.py index 593da0f..d3eff99 100755 --- a/Installation/nnAudio/librosa_functions.py +++ b/Installation/nnAudio/librosa_functions.py @@ -6,11 +6,13 @@ import numpy as np import warnings + ### ----------------Functions for generating kenral for Mel Spectrogram------------ ### # This code is equalvant to from librosa.filters import mel # By doing so, we can run nnAudio without installing librosa -def fft2gammatonemx(sr=20000, n_fft=2048, n_bins=64, width=1.0, fmin=0.0, - fmax=11025, maxlen=1024): +def fft2gammatonemx( + sr=20000, n_fft=2048, n_bins=64, width=1.0, fmin=0.0, fmax=11025, maxlen=1024 +): """ # Ellis' description in MATLAB: # [wts,cfreqa] = fft2gammatonemx(nfft, sr, nfilts, width, minfreq, maxfreq, maxlen) @@ -34,64 +36,123 @@ def fft2gammatonemx(sr=20000, n_fft=2048, n_bins=64, width=1.0, fmin=0.0, wts = np.zeros([n_bins, n_fft], dtype=np.float32) # after Slaney's MakeERBFilters - EarQ = 9.26449; - minBW = 24.7; - order = 1; + EarQ = 9.26449 + minBW = 24.7 + order = 1 nFr = np.array(range(n_bins)) + 1 em = EarQ * minBW - cfreqs = (fmax + em) * np.exp(nFr * (-np.log(fmax + em) + np.log(fmin + em)) / n_bins) - em + cfreqs = (fmax + em) * np.exp( + nFr * (-np.log(fmax + em) + np.log(fmin + em)) / n_bins + ) - em cfreqs = cfreqs[::-1] GTord = 4 ucircArray = np.array(range(int(n_fft / 2 + 1))) - ucirc = np.exp(1j * 2 * np.pi * ucircArray / n_fft); + ucirc = np.exp(1j * 2 * np.pi * ucircArray / n_fft) # justpoles = 0 :taking out the 'if' corresponding to this. - ERB = width * np.power(np.power(cfreqs / EarQ, order) + np.power(minBW, order), 1 / order); - B = 1.019 * 2 * np.pi * ERB; + ERB = width * np.power( + np.power(cfreqs / EarQ, order) + np.power(minBW, order), 1 / order + ) + B = 1.019 * 2 * np.pi * ERB r = np.exp(-B / sr) theta = 2 * np.pi * cfreqs / sr pole = r * np.exp(1j * theta) T = 1 / sr - ebt = np.exp(B * T); - cpt = 2 * cfreqs * np.pi * T; - ccpt = 2 * T * np.cos(cpt); - scpt = 2 * T * np.sin(cpt); - A11 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3 + 2 ** 1.5) * scpt, ebt), 2); - A12 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3 + 2 ** 1.5) * scpt, ebt), 2); - A13 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3 - 2 ** 1.5) * scpt, ebt), 2); - A14 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3 - 2 ** 1.5) * scpt, ebt), 2); - zros = -np.array([A11, A12, A13, A14]) / T; + ebt = np.exp(B * T) + cpt = 2 * cfreqs * np.pi * T + ccpt = 2 * T * np.cos(cpt) + scpt = 2 * T * np.sin(cpt) + A11 = -np.divide( + np.divide(ccpt, ebt) + np.divide(np.sqrt(3 + 2 ** 1.5) * scpt, ebt), 2 + ) + A12 = -np.divide( + np.divide(ccpt, ebt) - np.divide(np.sqrt(3 + 2 ** 1.5) * scpt, ebt), 2 + ) + A13 = -np.divide( + np.divide(ccpt, ebt) + np.divide(np.sqrt(3 - 2 ** 1.5) * scpt, ebt), 2 + ) + A14 = -np.divide( + np.divide(ccpt, ebt) - np.divide(np.sqrt(3 - 2 ** 1.5) * scpt, ebt), 2 + ) + zros = -np.array([A11, A12, A13, A14]) / T wIdx = range(int(n_fft / 2 + 1)) - gain = np.abs((-2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp( - -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * ( - np.cos(2 * cfreqs * np.pi * T) - np.sqrt(3 - 2 ** (3 / 2)) * np.sin( - 2 * cfreqs * np.pi * T))) * (-2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp( - -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * (np.cos(2 * cfreqs * np.pi * T) + np.sqrt( - 3 - 2 ** (3 / 2)) * np.sin(2 * cfreqs * np.pi * T))) * ( - -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp( - -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * ( - np.cos(2 * cfreqs * np.pi * T) - np.sqrt(3 + 2 ** (3 / 2)) * np.sin( - 2 * cfreqs * np.pi * T))) * ( - -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp( - -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * ( - np.cos(2 * cfreqs * np.pi * T) + np.sqrt(3 + 2 ** (3 / 2)) * np.sin( - 2 * cfreqs * np.pi * T))) / ( - -2 / np.exp(2 * B * T) - 2 * np.exp(4 * 1j * cfreqs * np.pi * T) + 2 * ( - 1 + np.exp(4 * 1j * cfreqs * np.pi * T)) / np.exp(B * T)) ** 4); + gain = np.abs( + ( + -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + + 2 + * np.exp(-(B * T) + 2 * 1j * cfreqs * np.pi * T) + * T + * ( + np.cos(2 * cfreqs * np.pi * T) + - np.sqrt(3 - 2 ** (3 / 2)) * np.sin(2 * cfreqs * np.pi * T) + ) + ) + * ( + -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + + 2 + * np.exp(-(B * T) + 2 * 1j * cfreqs * np.pi * T) + * T + * ( + np.cos(2 * cfreqs * np.pi * T) + + np.sqrt(3 - 2 ** (3 / 2)) * np.sin(2 * cfreqs * np.pi * T) + ) + ) + * ( + -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + + 2 + * np.exp(-(B * T) + 2 * 1j * cfreqs * np.pi * T) + * T + * ( + np.cos(2 * cfreqs * np.pi * T) + - np.sqrt(3 + 2 ** (3 / 2)) * np.sin(2 * cfreqs * np.pi * T) + ) + ) + * ( + -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + + 2 + * np.exp(-(B * T) + 2 * 1j * cfreqs * np.pi * T) + * T + * ( + np.cos(2 * cfreqs * np.pi * T) + + np.sqrt(3 + 2 ** (3 / 2)) * np.sin(2 * cfreqs * np.pi * T) + ) + ) + / ( + -2 / np.exp(2 * B * T) + - 2 * np.exp(4 * 1j * cfreqs * np.pi * T) + + 2 * (1 + np.exp(4 * 1j * cfreqs * np.pi * T)) / np.exp(B * T) + ) + ** 4 + ) # in MATLAB, there used to be 64 where here it says n_bins: - wts[:, wIdx] = ((T ** 4) / np.reshape(gain, (n_bins, 1))) * np.abs( - ucirc - np.reshape(zros[0], (n_bins, 1))) * np.abs(ucirc - np.reshape(zros[1], (n_bins, 1))) * np.abs( - ucirc - np.reshape(zros[2], (n_bins, 1))) * np.abs(ucirc - np.reshape(zros[3], (n_bins, 1))) * (np.abs( - np.power(np.multiply(np.reshape(pole, (n_bins, 1)) - ucirc, np.conj(np.reshape(pole, (n_bins, 1))) - ucirc), - -GTord))); - wts = wts[:, range(maxlen)]; + wts[:, wIdx] = ( + ((T ** 4) / np.reshape(gain, (n_bins, 1))) + * np.abs(ucirc - np.reshape(zros[0], (n_bins, 1))) + * np.abs(ucirc - np.reshape(zros[1], (n_bins, 1))) + * np.abs(ucirc - np.reshape(zros[2], (n_bins, 1))) + * np.abs(ucirc - np.reshape(zros[3], (n_bins, 1))) + * ( + np.abs( + np.power( + np.multiply( + np.reshape(pole, (n_bins, 1)) - ucirc, + np.conj(np.reshape(pole, (n_bins, 1))) - ucirc, + ), + -GTord, + ) + ) + ) + ) + wts = wts[:, range(maxlen)] return wts, cfreqs -def gammatone(sr, n_fft, n_bins=64, fmin=20.0, fmax=None, htk=False, - norm=1, dtype=np.float32): + +def gammatone( + sr, n_fft, n_bins=64, fmin=20.0, fmax=None, htk=False, norm=1, dtype=np.float32 +): """Create a Filterbank matrix to combine FFT bins into Gammatone bins Parameters ---------- @@ -124,10 +185,18 @@ def gammatone(sr, n_fft, n_bins=64, fmin=20.0, fmax=None, htk=False, if fmax is None: fmax = float(sr) / 2 n_bins = int(n_bins) - - weights,_ = fft2gammatonemx(sr=sr, n_fft=n_fft, n_bins=n_bins, fmin=fmin, fmax=fmax, maxlen=int(n_fft//2+1)) - return (1/n_fft)*weights + weights, _ = fft2gammatonemx( + sr=sr, + n_fft=n_fft, + n_bins=n_bins, + fmin=fmin, + fmax=fmax, + maxlen=int(n_fft // 2 + 1), + ) + + return (1 / n_fft) * weights + def mel_to_hz(mels, htk=False): """Convert mel bin numbers to frequencies @@ -155,7 +224,7 @@ def mel_to_hz(mels, htk=False): mels = np.asanyarray(mels) if htk: - return 700.0 * (10.0**(mels / 2595.0) - 1.0) + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) # Fill in the linear scale f_min = 0.0 @@ -163,13 +232,13 @@ def mel_to_hz(mels, htk=False): freqs = f_min + f_sp * mels # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region if mels.ndim: # If we have vector data, vectorize - log_t = (mels >= min_log_mel) + log_t = mels >= min_log_mel freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) elif mels >= min_log_mel: # If we have scalar data, check directly @@ -177,6 +246,7 @@ def mel_to_hz(mels, htk=False): return freqs + def hz_to_mel(frequencies, htk=False): """Convert Hz to Mels Examples @@ -213,22 +283,23 @@ def hz_to_mel(frequencies, htk=False): # Fill in the log-scale part - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region if frequencies.ndim: # If we have array data, vectorize - log_t = (frequencies >= min_log_hz) - mels[log_t] = min_log_mel + np.log(frequencies[log_t]/min_log_hz) / logstep + log_t = frequencies >= min_log_hz + mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep elif frequencies >= min_log_hz: # If we have scalar data, heck directly mels = min_log_mel + np.log(frequencies / min_log_hz) / logstep return mels + def fft_frequencies(sr=22050, n_fft=2048): - '''Alternative implementation of `np.fft.fftfreq` + """Alternative implementation of `np.fft.fftfreq` Parameters ---------- sr : number > 0 [scalar] @@ -244,20 +315,18 @@ def fft_frequencies(sr=22050, n_fft=2048): >>> librosa.fft_frequencies(sr=22050, n_fft=16) array([ 0. , 1378.125, 2756.25 , 4134.375, 5512.5 , 6890.625, 8268.75 , 9646.875, 11025. ]) - ''' + """ + + return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) - return np.linspace(0, - float(sr) / 2, - int(1 + n_fft//2), - endpoint=True) def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False): """ This function is cloned from librosa 0.7. - Please refer to the original + Please refer to the original `documentation `__ for more info. - + Parameters ---------- n_mels : int > 0 [scalar] @@ -302,13 +371,15 @@ def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False): return mel_to_hz(mels, htk=htk) -def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, - norm=1, dtype=np.float32): + +def mel( + sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1, dtype=np.float32 +): """ This function is cloned from librosa 0.7. - Please refer to the original + Please refer to the original `documentation `__ - for more info. + for more info. Create a Filterbank matrix to combine FFT bins into Mel-frequency bins @@ -343,7 +414,7 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, Notes ----- This function caches at level 10. - + Examples -------- >>> melfb = librosa.filters.mel(22050, 2048) @@ -374,7 +445,7 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, fmax = float(sr) / 2 if norm is not None and norm != 1 and norm != np.inf: - raise ParameterError('Unsupported norm: {}'.format(repr(norm))) + raise ParameterError("Unsupported norm: {}".format(repr(norm))) # Initialize the weights n_mels = int(n_mels) @@ -392,31 +463,35 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, for i in range(n_mels): # lower and upper slopes for all bins lower = -ramps[i] / fdiff[i] - upper = ramps[i+2] / fdiff[i+1] + upper = ramps[i + 2] / fdiff[i + 1] # .. then intersect them with each other and zero weights[i] = np.maximum(0, np.minimum(lower, upper)) if norm == 1: # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels]) + enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) weights *= enorm[:, np.newaxis] # Only check weights if f_mel[0] is positive if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): # This means we have an empty channel somewhere - warnings.warn('Empty filters detected in mel frequency basis. ' - 'Some channels will produce empty responses. ' - 'Try increasing your sampling rate (and fmax) or ' - 'reducing n_mels.') + warnings.warn( + "Empty filters detected in mel frequency basis. " + "Some channels will produce empty responses. " + "Try increasing your sampling rate (and fmax) or " + "reducing n_mels." + ) return weights + + ### ------------------End of Functions for generating kenral for Mel Spectrogram ----------------### ### ------------------Functions for making STFT same as librosa ---------------------------------### def pad_center(data, size, axis=-1, **kwargs): - '''Wrapper for np.pad to automatically center an array prior to padding. + """Wrapper for np.pad to automatically center an array prior to padding. This is analogous to `str.center()` Examples @@ -470,9 +545,9 @@ def pad_center(data, size, axis=-1, **kwargs): See Also -------- numpy.pad - ''' + """ - kwargs.setdefault('mode', 'constant') + kwargs.setdefault("mode", "constant") n = data.shape[axis] @@ -482,8 +557,9 @@ def pad_center(data, size, axis=-1, **kwargs): lengths[axis] = (lpad, int(size - n - lpad)) if lpad < 0: - raise ParameterError(('Target size ({:d}) must be ' - 'at least input size ({:d})').format(size, n)) + raise ParameterError( + ("Target size ({:d}) must be " "at least input size ({:d})").format(size, n) + ) return np.pad(data, lengths, **kwargs) @@ -493,7 +569,18 @@ def pad_center(data, size, axis=-1, **kwargs): ### ------------------Functions for making Chroma_stft same as librosa ---------------------------------### -def chroma(sr, n_fft, n_chroma=12, tuning=0.0, ctroct=5.0, octwidth=2, norm=2, base_c=True, dtype=np.float32): + +def chroma( + sr, + n_fft, + n_chroma=12, + tuning=0.0, + ctroct=5.0, + octwidth=2, + norm=2, + base_c=True, + dtype=np.float32, +): """Create a chroma filter bank. This creates a linear transformation matrix to project @@ -953,11 +1040,13 @@ def tiny(x): # Only floating types generate a tiny if np.issubdtype(x.dtype, np.floating) or np.issubdtype( - x.dtype, np.complexfloating + x.dtype, np.complexfloating ): dtype = x.dtype else: dtype = np.float32 return np.finfo(dtype).tiny + + ### ------------------End of functions for making Chroma_stft same as librosa ---------------------------### diff --git a/Installation/nnAudio/utils.py b/Installation/nnAudio/utils.py index e315b81..10e8220 100644 --- a/Installation/nnAudio/utils.py +++ b/Installation/nnAudio/utils.py @@ -16,17 +16,18 @@ from nnAudio.librosa_functions import * -sz_float = 4 # size of a float -epsilon = 1e-8 # fudge factor for normalization +sz_float = 4 # size of a float +epsilon = 1e-8 # fudge factor for normalization # Acquires and parses the PyTorch version __TORCH_GTE_1_7 = False -split_version = torch.__version__.split('.') +split_version = torch.__version__.split(".") major_version = int(split_version[0]) minor_version = int(split_version[1]) if major_version > 1 or (major_version == 1 and minor_version >= 7): __TORCH_GTE_1_7 = True import torch.fft + if "torch.fft" not in sys.modules: raise RuntimeError("torch.fft module available but not imported") @@ -38,27 +39,35 @@ def rfft_fn(x, n=None, onesided=False): else: return torch.rfft(x, n, onesided=onesided) + ## --------------------------- Filter Design ---------------------------## def torch_window_sumsquare(w, n_frames, stride, n_fft, power=2): - w_stacks = w.unsqueeze(-1).repeat((1,n_frames)).unsqueeze(0) + w_stacks = w.unsqueeze(-1).repeat((1, n_frames)).unsqueeze(0) # Window length + stride*(frames-1) - output_len = w_stacks.shape[1] + stride*(w_stacks.shape[2]-1) - return fold(w_stacks**power, (1,output_len), kernel_size=(1,n_fft), stride=stride) + output_len = w_stacks.shape[1] + stride * (w_stacks.shape[2] - 1) + return fold( + w_stacks ** power, (1, output_len), kernel_size=(1, n_fft), stride=stride + ) + def overlap_add(X, stride): n_fft = X.shape[1] - output_len = n_fft + stride*(X.shape[2]-1) - - return fold(X, (1,output_len), kernel_size=(1,n_fft), stride=stride).flatten(1) + output_len = n_fft + stride * (X.shape[2] - 1) + + return fold(X, (1, output_len), kernel_size=(1, n_fft), stride=stride).flatten(1) + -def uniform_distribution(r1,r2, *size, device): +def uniform_distribution(r1, r2, *size, device): return (r1 - r2) * torch.rand(*size, device=device) + r2 + def extend_fbins(X): """Extending the number of frequency bins from `n_fft//2+1` back to `n_fft` by - reversing all bins except DC and Nyquist and append it on top of existing spectrogram""" - X_upper = X[:,1:-1].flip(1) - X_upper[:,:,:,1] = -X_upper[:,:,:,1] # For the imaganinry part, it is an odd function + reversing all bins except DC and Nyquist and append it on top of existing spectrogram""" + X_upper = X[:, 1:-1].flip(1) + X_upper[:, :, :, 1] = -X_upper[ + :, :, :, 1 + ] # For the imaganinry part, it is an odd function return torch.cat((X[:, :, :], X_upper), 1) @@ -87,7 +96,7 @@ def downsampling_by_n(x, filterKernel, n): >>> x_down = downsampling_by_n(x, filterKernel) """ - x = conv1d(x,filterKernel,stride=n, padding=(filterKernel.shape[-1]-1)//2) + x = conv1d(x, filterKernel, stride=n, padding=(filterKernel.shape[-1] - 1) // 2) return x @@ -112,7 +121,7 @@ def downsampling_by_2(x, filterKernel): >>> x_down = downsampling_by_2(x, filterKernel) """ - x = conv1d(x,filterKernel,stride=2, padding=(filterKernel.shape[-1]-1)//2) + x = conv1d(x, filterKernel, stride=2, padding=(filterKernel.shape[-1] - 1) // 2) return x @@ -139,6 +148,7 @@ def nextpow2(A): return int(np.ceil(np.log2(A))) + ## Basic tools for computation ## def prepow2(A): """A helper function to calculate the next nearest number to the power of 2. @@ -184,8 +194,12 @@ def complex_mul(cqt_filter, stft): fourier_real = stft[0] fourier_imag = stft[1] - CQT_real = torch.matmul(cqt_filter_real, fourier_real) - torch.matmul(cqt_filter_imag, fourier_imag) - CQT_imag = torch.matmul(cqt_filter_real, fourier_imag) + torch.matmul(cqt_filter_imag, fourier_real) + CQT_real = torch.matmul(cqt_filter_real, fourier_real) - torch.matmul( + cqt_filter_imag, fourier_imag + ) + CQT_imag = torch.matmul(cqt_filter_real, fourier_imag) + torch.matmul( + cqt_filter_imag, fourier_real + ) return CQT_real, CQT_imag @@ -203,7 +217,9 @@ def broadcast_dim(x): elif x.dim() == 3: pass else: - raise ValueError("Only support input with shape = (batch, len) or shape = (len)") + raise ValueError( + "Only support input with shape = (batch, len) or shape = (len)" + ) return x @@ -213,17 +229,28 @@ def broadcast_dim_conv2d(x): """ if x.dim() == 3: - x = x[:, None, :,:] + x = x[:, None, :, :] else: - raise ValueError("Only support input with shape = (batch, len) or shape = (len)") + raise ValueError( + "Only support input with shape = (batch, len) or shape = (len)" + ) return x ## Kernal generation functions ## -def create_fourier_kernels(n_fft, win_length=None, freq_bins=None, fmin=50,fmax=6000, sr=44100, - freq_scale='linear', window='hann', verbose=True): - """ This function creates the Fourier Kernel for STFT, Melspectrogram and CQT. +def create_fourier_kernels( + n_fft, + win_length=None, + freq_bins=None, + fmin=50, + fmax=6000, + sr=44100, + freq_scale="linear", + window="hann", + verbose=True, +): + """This function creates the Fourier Kernel for STFT, Melspectrogram and CQT. Most of the parameters follow librosa conventions. Part of the code comes from pytorch_musicnet. https://github.com/jthickstun/pytorch_musicnet @@ -269,12 +296,14 @@ def create_fourier_kernels(n_fft, win_length=None, freq_bins=None, fmin=50,fmax= """ - if freq_bins==None: freq_bins = n_fft//2+1 - if win_length==None: win_length = n_fft + if freq_bins == None: + freq_bins = n_fft // 2 + 1 + if win_length == None: + win_length = n_fft - s = np.arange(0, n_fft, 1.) - wsin = np.empty((freq_bins,1,n_fft)) - wcos = np.empty((freq_bins,1,n_fft)) + s = np.arange(0, n_fft, 1.0) + wsin = np.empty((freq_bins, 1, n_fft)) + wcos = np.empty((freq_bins, 1, n_fft)) start_freq = fmin end_freq = fmax bins2freq = [] @@ -285,74 +314,110 @@ def create_fourier_kernels(n_fft, win_length=None, freq_bins=None, fmin=50,fmax= # Choosing window shape - window_mask = get_window(window,int(win_length), fftbins=True) + window_mask = get_window(window, int(win_length), fftbins=True) window_mask = pad_center(window_mask, n_fft) - if freq_scale == 'linear': - if verbose==True: - print(f"sampling rate = {sr}. Please make sure the sampling rate is correct in order to" - f"get a valid freq range") - start_bin = start_freq*n_fft/sr - scaling_ind = (end_freq-start_freq)*(n_fft/sr)/freq_bins + if freq_scale == "linear": + if verbose == True: + print( + f"sampling rate = {sr}. Please make sure the sampling rate is correct in order to" + f"get a valid freq range" + ) + start_bin = start_freq * n_fft / sr + scaling_ind = (end_freq - start_freq) * (n_fft / sr) / freq_bins - for k in range(freq_bins): # Only half of the bins contain useful info + for k in range(freq_bins): # Only half of the bins contain useful info # print("linear freq = {}".format((k*scaling_ind+start_bin)*sr/n_fft)) - bins2freq.append((k*scaling_ind+start_bin)*sr/n_fft) - binslist.append((k*scaling_ind+start_bin)) - wsin[k,0,:] = np.sin(2*np.pi*(k*scaling_ind+start_bin)*s/n_fft) - wcos[k,0,:] = np.cos(2*np.pi*(k*scaling_ind+start_bin)*s/n_fft) - - elif freq_scale == 'log': - if verbose==True: - print(f"sampling rate = {sr}. Please make sure the sampling rate is correct in order to" - f"get a valid freq range") - start_bin = start_freq*n_fft/sr - scaling_ind = np.log(end_freq/start_freq)/freq_bins - - for k in range(freq_bins): # Only half of the bins contain useful info + bins2freq.append((k * scaling_ind + start_bin) * sr / n_fft) + binslist.append((k * scaling_ind + start_bin)) + wsin[k, 0, :] = np.sin( + 2 * np.pi * (k * scaling_ind + start_bin) * s / n_fft + ) + wcos[k, 0, :] = np.cos( + 2 * np.pi * (k * scaling_ind + start_bin) * s / n_fft + ) + + elif freq_scale == "log": + if verbose == True: + print( + f"sampling rate = {sr}. Please make sure the sampling rate is correct in order to" + f"get a valid freq range" + ) + start_bin = start_freq * n_fft / sr + scaling_ind = np.log(end_freq / start_freq) / freq_bins + + for k in range(freq_bins): # Only half of the bins contain useful info # print("log freq = {}".format(np.exp(k*scaling_ind)*start_bin*sr/n_fft)) - bins2freq.append(np.exp(k*scaling_ind)*start_bin*sr/n_fft) - binslist.append((np.exp(k*scaling_ind)*start_bin)) - wsin[k,0,:] = np.sin(2*np.pi*(np.exp(k*scaling_ind)*start_bin)*s/n_fft) - wcos[k,0,:] = np.cos(2*np.pi*(np.exp(k*scaling_ind)*start_bin)*s/n_fft) - - elif freq_scale == 'no': - for k in range(freq_bins): # Only half of the bins contain useful info - bins2freq.append(k*sr/n_fft) + bins2freq.append(np.exp(k * scaling_ind) * start_bin * sr / n_fft) + binslist.append((np.exp(k * scaling_ind) * start_bin)) + wsin[k, 0, :] = np.sin( + 2 * np.pi * (np.exp(k * scaling_ind) * start_bin) * s / n_fft + ) + wcos[k, 0, :] = np.cos( + 2 * np.pi * (np.exp(k * scaling_ind) * start_bin) * s / n_fft + ) + + elif freq_scale == "no": + for k in range(freq_bins): # Only half of the bins contain useful info + bins2freq.append(k * sr / n_fft) binslist.append(k) - wsin[k,0,:] = np.sin(2*np.pi*k*s/n_fft) - wcos[k,0,:] = np.cos(2*np.pi*k*s/n_fft) + wsin[k, 0, :] = np.sin(2 * np.pi * k * s / n_fft) + wcos[k, 0, :] = np.cos(2 * np.pi * k * s / n_fft) else: print("Please select the correct frequency scale, 'linear' or 'log'") - return wsin.astype(np.float32),wcos.astype(np.float32), bins2freq, binslist, window_mask.astype(np.float32) + return ( + wsin.astype(np.float32), + wcos.astype(np.float32), + bins2freq, + binslist, + window_mask.astype(np.float32), + ) # Tools for CQT -def create_cqt_kernels(Q, fs, fmin, n_bins=84, bins_per_octave=12, norm=1, - window='hann', fmax=None, topbin_check=True): + +def create_cqt_kernels( + Q, + fs, + fmin, + n_bins=84, + bins_per_octave=12, + norm=1, + window="hann", + fmax=None, + topbin_check=True, +): """ Automatically create CQT kernels in time domain """ - fftLen = 2**nextpow2(np.ceil(Q * fs / fmin)) + fftLen = 2 ** nextpow2(np.ceil(Q * fs / fmin)) # minWin = 2**nextpow2(np.ceil(Q * fs / fmax)) - if (fmax != None) and (n_bins == None): - n_bins = np.ceil(bins_per_octave * np.log2(fmax / fmin)) # Calculate the number of bins + if (fmax != None) and (n_bins == None): + n_bins = np.ceil( + bins_per_octave * np.log2(fmax / fmin) + ) # Calculate the number of bins freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave)) - elif (fmax == None) and (n_bins != None): + elif (fmax == None) and (n_bins != None): freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave)) else: - warnings.warn('If fmax is given, n_bins will be ignored',SyntaxWarning) - n_bins = np.ceil(bins_per_octave * np.log2(fmax / fmin)) # Calculate the number of bins + warnings.warn("If fmax is given, n_bins will be ignored", SyntaxWarning) + n_bins = np.ceil( + bins_per_octave * np.log2(fmax / fmin) + ) # Calculate the number of bins freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave)) - if np.max(freqs) > fs/2 and topbin_check==True: - raise ValueError('The top bin {}Hz has exceeded the Nyquist frequency, \ - please reduce the n_bins'.format(np.max(freqs))) + if np.max(freqs) > fs / 2 and topbin_check == True: + raise ValueError( + "The top bin {}Hz has exceeded the Nyquist frequency, \ + please reduce the n_bins".format( + np.max(freqs) + ) + ) tempKernel = np.zeros((int(n_bins), int(fftLen)), dtype=np.complex64) specKernel = np.zeros((int(n_bins), int(fftLen)), dtype=np.complex64) @@ -363,17 +428,21 @@ def create_cqt_kernels(Q, fs, fmin, n_bins=84, bins_per_octave=12, norm=1, l = np.ceil(Q * fs / freq) # Centering the kernels - if l%2==1: # pad more zeros on RHS - start = int(np.ceil(fftLen / 2.0 - l / 2.0))-1 + if l % 2 == 1: # pad more zeros on RHS + start = int(np.ceil(fftLen / 2.0 - l / 2.0)) - 1 else: start = int(np.ceil(fftLen / 2.0 - l / 2.0)) - sig = get_window_dispatch(window,int(l), fftbins=True)*np.exp(np.r_[-l//2:l//2]*1j*2*np.pi*freq/fs)/l + sig = ( + get_window_dispatch(window, int(l), fftbins=True) + * np.exp(np.r_[-l // 2 : l // 2] * 1j * 2 * np.pi * freq / fs) + / l + ) - if norm: # Normalizing the filter # Trying to normalize like librosa - tempKernel[k, start:start + int(l)] = sig/np.linalg.norm(sig, norm) + if norm: # Normalizing the filter # Trying to normalize like librosa + tempKernel[k, start : start + int(l)] = sig / np.linalg.norm(sig, norm) else: - tempKernel[k, start:start + int(l)] = sig + tempKernel[k, start : start + int(l)] = sig # specKernel[k, :] = fft(tempKernel[k]) # return specKernel[:,:fftLen//2+1], fftLen, torch.tensor(lenghts).float() @@ -384,17 +453,22 @@ def get_window_dispatch(window, N, fftbins=True): if isinstance(window, str): return get_window(window, N, fftbins=fftbins) elif isinstance(window, tuple): - if window[0] == 'gaussian': + if window[0] == "gaussian": assert window[1] >= 0 - sigma = np.floor(- N / 2 / np.sqrt(- 2 * np.log(10**(- window[1] / 20)))) - return get_window(('gaussian', sigma), N, fftbins=fftbins) + sigma = np.floor(-N / 2 / np.sqrt(-2 * np.log(10 ** (-window[1] / 20)))) + return get_window(("gaussian", sigma), N, fftbins=fftbins) else: Warning("Tuple windows may have undesired behaviour regarding Q factor") elif isinstance(window, float): - Warning("You are using Kaiser window with beta factor " + str(window) + ". Correct behaviour not checked.") + Warning( + "You are using Kaiser window with beta factor " + + str(window) + + ". Correct behaviour not checked." + ) else: - raise Exception("The function get_window from scipy only supports strings, tuples and floats.") - + raise Exception( + "The function get_window from scipy only supports strings, tuples and floats." + ) def get_cqt_complex(x, cqt_kernels_real, cqt_kernels_imag, hop_length, padding): @@ -405,18 +479,27 @@ def get_cqt_complex(x, cqt_kernels_real, cqt_kernels_imag, hop_length, padding): # STFT, converting the audio input from time domain to frequency domain try: - x = padding(x) # When center == True, we need padding at the beginning and ending + x = padding( + x + ) # When center == True, we need padding at the beginning and ending except: - warnings.warn(f"\ninput size = {x.shape}\tkernel size = {cqt_kernels_real.shape[-1]}\n" - "padding with reflection mode might not be the best choice, try using constant padding", - UserWarning) - x = torch.nn.functional.pad(x, (cqt_kernels_real.shape[-1]//2, cqt_kernels_real.shape[-1]//2)) + warnings.warn( + f"\ninput size = {x.shape}\tkernel size = {cqt_kernels_real.shape[-1]}\n" + "padding with reflection mode might not be the best choice, try using constant padding", + UserWarning, + ) + x = torch.nn.functional.pad( + x, (cqt_kernels_real.shape[-1] // 2, cqt_kernels_real.shape[-1] // 2) + ) CQT_real = conv1d(x, cqt_kernels_real, stride=hop_length) CQT_imag = -conv1d(x, cqt_kernels_imag, stride=hop_length) - return torch.stack((CQT_real, CQT_imag),-1) + return torch.stack((CQT_real, CQT_imag), -1) + -def get_cqt_complex2(x, cqt_kernels_real, cqt_kernels_imag, hop_length, padding, wcos=None, wsin=None): +def get_cqt_complex2( + x, cqt_kernels_real, cqt_kernels_imag, hop_length, padding, wcos=None, wsin=None +): """Multiplying the STFT result with the cqt_kernel, check out the 1992 CQT paper [1] for how to multiple the STFT result with the CQT kernel [2] Brown, Judith C.C. and Miller Puckette. “An efficient algorithm for the calculation of @@ -424,29 +507,32 @@ def get_cqt_complex2(x, cqt_kernels_real, cqt_kernels_imag, hop_length, padding, # STFT, converting the audio input from time domain to frequency domain try: - x = padding(x) # When center == True, we need padding at the beginning and ending + x = padding( + x + ) # When center == True, we need padding at the beginning and ending except: - warnings.warn(f"\ninput size = {x.shape}\tkernel size = {cqt_kernels_real.shape[-1]}\n" - "padding with reflection mode might not be the best choice, try using constant padding", - UserWarning) - x = torch.nn.functional.pad(x, (cqt_kernels_real.shape[-1]//2, cqt_kernels_real.shape[-1]//2)) - - - - if wcos==None or wsin==None: + warnings.warn( + f"\ninput size = {x.shape}\tkernel size = {cqt_kernels_real.shape[-1]}\n" + "padding with reflection mode might not be the best choice, try using constant padding", + UserWarning, + ) + x = torch.nn.functional.pad( + x, (cqt_kernels_real.shape[-1] // 2, cqt_kernels_real.shape[-1] // 2) + ) + + if wcos == None or wsin == None: CQT_real = conv1d(x, cqt_kernels_real, stride=hop_length) CQT_imag = -conv1d(x, cqt_kernels_imag, stride=hop_length) - - else: + + else: fourier_real = conv1d(x, wcos, stride=hop_length) fourier_imag = conv1d(x, wsin, stride=hop_length) # Multiplying input with the CQT kernel in freq domain - CQT_real, CQT_imag = complex_mul((cqt_kernels_real, cqt_kernels_imag), - (fourier_real, fourier_imag)) - - return torch.stack((CQT_real, CQT_imag),-1) - + CQT_real, CQT_imag = complex_mul( + (cqt_kernels_real, cqt_kernels_imag), (fourier_real, fourier_imag) + ) + return torch.stack((CQT_real, CQT_imag), -1) def create_lowpass_filter(band_center=0.5, kernelLength=256, transitionBandwidth=0.03): @@ -485,44 +571,50 @@ def create_lowpass_filter(band_center=0.5, kernelLength=256, transitionBandwidth return filterKernel.astype(np.float32) + def get_early_downsample_params(sr, hop_length, fmax_t, Q, n_octaves, verbose): """Used in CQT2010 and CQT2010v2""" - - window_bandwidth = 1.5 # for hann window + + window_bandwidth = 1.5 # for hann window filter_cutoff = fmax_t * (1 + 0.5 * window_bandwidth / Q) - sr, hop_length, downsample_factor = early_downsample(sr, - hop_length, - n_octaves, - sr//2, - filter_cutoff) + sr, hop_length, downsample_factor = early_downsample( + sr, hop_length, n_octaves, sr // 2, filter_cutoff + ) if downsample_factor != 1: - if verbose==True: + if verbose == True: print("Can do early downsample, factor = ", downsample_factor) - earlydownsample=True + earlydownsample = True # print("new sr = ", sr) # print("new hop_length = ", hop_length) - early_downsample_filter = create_lowpass_filter(band_center=1/downsample_factor, - kernelLength=256, - transitionBandwidth=0.03) + early_downsample_filter = create_lowpass_filter( + band_center=1 / downsample_factor, + kernelLength=256, + transitionBandwidth=0.03, + ) early_downsample_filter = torch.tensor(early_downsample_filter)[None, None, :] else: - if verbose==True: - print("No early downsampling is required, downsample_factor = ", downsample_factor) + if verbose == True: + print( + "No early downsampling is required, downsample_factor = ", + downsample_factor, + ) early_downsample_filter = None - earlydownsample=False + earlydownsample = False return sr, hop_length, downsample_factor, early_downsample_filter, earlydownsample -def early_downsample(sr, hop_length, n_octaves, - nyquist, filter_cutoff): - '''Return new sampling rate and hop length after early dowansampling''' - downsample_count = early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves) + +def early_downsample(sr, hop_length, n_octaves, nyquist, filter_cutoff): + """Return new sampling rate and hop length after early dowansampling""" + downsample_count = early_downsample_count( + nyquist, filter_cutoff, hop_length, n_octaves + ) # print("downsample_count = ", downsample_count) - downsample_factor = 2**(downsample_count) + downsample_factor = 2 ** (downsample_count) - hop_length //= downsample_factor # Getting new hop_length - new_sr = sr / float(downsample_factor) # Getting new sampling rate + hop_length //= downsample_factor # Getting new hop_length + new_sr = sr / float(downsample_factor) # Getting new sampling rate sr = new_sr return sr, hop_length, downsample_factor @@ -532,10 +624,11 @@ def early_downsample(sr, hop_length, n_octaves, # They are used to determine the number of pre resamplings if the starting and ending frequency # are both in low frequency regions. def early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves): - '''Compute the number of early downsampling operations''' + """Compute the number of early downsampling operations""" - downsample_count1 = max(0, int(np.ceil(np.log2(0.85 * nyquist / - filter_cutoff)) - 1) - 1) + downsample_count1 = max( + 0, int(np.ceil(np.log2(0.85 * nyquist / filter_cutoff)) - 1) - 1 + ) # print("downsample_count1 = ", downsample_count1) num_twos = nextpow2(hop_length) downsample_count2 = max(0, num_twos - n_octaves + 1) @@ -543,12 +636,14 @@ def early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves): return min(downsample_count1, downsample_count2) -def early_downsample(sr, hop_length, n_octaves, - nyquist, filter_cutoff): - '''Return new sampling rate and hop length after early dowansampling''' - downsample_count = early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves) + +def early_downsample(sr, hop_length, n_octaves, nyquist, filter_cutoff): + """Return new sampling rate and hop length after early dowansampling""" + downsample_count = early_downsample_count( + nyquist, filter_cutoff, hop_length, n_octaves + ) # print("downsample_count = ", downsample_count) - downsample_factor = 2**(downsample_count) + downsample_factor = 2 ** (downsample_count) hop_length //= downsample_factor # Getting new hop_length new_sr = sr / float(downsample_factor) # Getting new sampling rate diff --git a/Installation/setup.py b/Installation/setup.py index 9e4a323..f8aa57c 100755 --- a/Installation/setup.py +++ b/Installation/setup.py @@ -5,21 +5,24 @@ with open("README.md", "r") as fh: long_description = fh.read() + def read(rel_path): here = os.path.abspath(os.path.dirname(__file__)) - with codecs.open(os.path.join(here, rel_path), 'r') as fp: - return fp.read() - + with codecs.open(os.path.join(here, rel_path), "r") as fp: + return fp.read() + + def get_version(rel_path): for line in read(rel_path).splitlines(): - if line.startswith('__version__'): + if line.startswith("__version__"): delim = '"' if '"' in line else "'" return line.split(delim)[1] else: - raise RuntimeError("Unable to find version string.") - + raise RuntimeError("Unable to find version string.") + + setuptools.setup( - name="nnAudio", # Replace with your own username + name="nnAudio", # Replace with your own username version=get_version("nnAudio/__init__.py"), author="KinWaiCheuk", author_email="u3500684@connect.hku.hk", @@ -33,11 +36,9 @@ def get_version(rel_path): "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires='>=3.6', + python_requires=">=3.6", install_requires=[ - 'scipy', + "scipy", ], - extras_require={ - 'tests': ['pytest', 'librosa'] - } + extras_require={"tests": ["pytest", "librosa"]}, ) diff --git a/Installation/tests/parameters.py b/Installation/tests/parameters.py index c8356ac..e1835f9 100644 --- a/Installation/tests/parameters.py +++ b/Installation/tests/parameters.py @@ -14,25 +14,25 @@ """ stft_parameters = [] -n_fft = [1024,2048] -hop_length = {128,512,1024} -window = ['ones', 'hann', 'hamming'] +n_fft = [1024, 2048] +hop_length = {128, 512, 1024} +window = ["ones", "hann", "hamming"] for i in n_fft: for k in window: for j in hop_length: - if j < (i/2): - stft_parameters.append((i,j,k)) -stft_parameters.append((256, None, 'hann')) + if j < (i / 2): + stft_parameters.append((i, j, k)) +stft_parameters.append((256, None, "hann")) stft_with_win_parameters = [] -n_fft = [512,1024] +n_fft = [512, 1024] win_length = [400, 900] -hop_length = {128,256} +hop_length = {128, 256} for i in n_fft: for j in win_length: - if j < i: + if j < i: for k in hop_length: - if k < (i/2): - stft_with_win_parameters.append((i,j,k)) + if k < (i / 2): + stft_with_win_parameters.append((i, j, k)) -mel_win_parameters = [(512,400), (1024, 1000)] \ No newline at end of file +mel_win_parameters = [(512, 400), (1024, 1000)] diff --git a/Installation/tests/test_cfp.py b/Installation/tests/test_cfp.py new file mode 100644 index 0000000..fd80ff6 --- /dev/null +++ b/Installation/tests/test_cfp.py @@ -0,0 +1,102 @@ +import pytest +import librosa +import torch +from scipy.signal import chirp, sweep_poly +import sys + +sys.path.insert(0, "./") + +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +from nnAudio.Spectrogram import * +from parameters import * +import warnings + +gpu_idx = 0 # Choose which GPU to use + +# If GPU is avaliable, also test on GPU +if torch.cuda.is_available(): + device_args = ["cpu", f"cuda:{gpu_idx}"] +else: + warnings.warn("GPU is not avaliable, testing only on CPU") + device_args = ["cpu"] + +# librosa example audio for testing +example_y, example_sr = librosa.load(librosa.util.example_audio_file()) + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cfp_original(device): + x = torch.tensor(example_y, device=device).unsqueeze(0) + + cfp_layer = Combined_Frequency_Periodicity( + fr=2, + fs=44100, + hop_length=320, + window_size=2049, + fc=80, + tc=0.001, + g=[0.24, 0.6, 1], + NumPerOct=48, + ).to(device) + X = cfp_layer(x) + ground_truth = torch.load(os.path.join(dir_path, "ground-truths/cfp_original.pt")) + + for i, j in zip(X, ground_truth): + assert torch.allclose(i.cpu(), j, 1e-3, 1e-1) + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cfp_new(device): + x = torch.tensor(example_y, device=device).unsqueeze(0) + + cfp_layer = CFP( + fr=2, + fs=44100, + hop_length=320, + window_size=2049, + fc=80, + tc=0.001, + g=[0.24, 0.6, 1], + NumPerOct=48, + ).to(device) + X = cfp_layer(x) + ground_truth = torch.load(os.path.join(dir_path, "ground-truths/cfp_new.pt")) + assert torch.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-1) + + +if torch.cuda.is_available(): + x = torch.randn((4, 44100)).to( + f"cuda:{gpu_idx}" + ) # Create a batch of input for the following Data.Parallel test + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) + def test_cfp_original_Parallel(device): + cfp_layer = Combined_Frequency_Periodicity( + fr=2, + fs=44100, + hop_length=320, + window_size=2049, + fc=80, + tc=0.001, + g=[0.24, 0.6, 1], + NumPerOct=48, + ).to(device) + cfp_layer = torch.nn.DataParallel(cfp_layer) + X = cfp_layer(x) + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) + def test_cfp_new_Parallel(device): + cfp_layer = CFP( + fr=2, + fs=44100, + hop_length=320, + window_size=2049, + fc=80, + tc=0.001, + g=[0.24, 0.6, 1], + NumPerOct=48, + ).to(device) + X = cfp_layer(x.to(device)) diff --git a/Installation/tests/test_cqt.py b/Installation/tests/test_cqt.py new file mode 100644 index 0000000..a3d4465 --- /dev/null +++ b/Installation/tests/test_cqt.py @@ -0,0 +1,292 @@ +import pytest +import librosa +import torch +from scipy.signal import chirp, sweep_poly +import sys + +sys.path.insert(0, "./") + +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +from nnAudio.Spectrogram import * +from parameters import * +import warnings + +gpu_idx = 0 # Choose which GPU to use + +# If GPU is avaliable, also test on GPU +if torch.cuda.is_available(): + device_args = ["cpu", f"cuda:{gpu_idx}"] +else: + warnings.warn("GPU is not avaliable, testing only on CPU") + device_args = ["cpu"] + +# librosa example audio for testing +example_y, example_sr = librosa.load(librosa.util.example_audio_file()) + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cqt_1992(device): + # Log sweep case + fs = 44100 + t = 1 + f0 = 55 + f1 = 22050 + s = np.linspace(0, t, fs * t) + x = chirp(s, f0, 1, f1, method="logarithmic") + x = x.astype(dtype=np.float32) + + # Magnitude + stft = CQT1992( + sr=fs, fmin=220, output_format="Magnitude", n_bins=80, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + + # Complex + stft = CQT1992( + sr=fs, fmin=220, output_format="Complex", n_bins=80, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + + # Phase + stft = CQT1992( + sr=fs, fmin=220, output_format="Phase", n_bins=160, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + + assert True + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cqt_2010(device): + # Log sweep case + fs = 44100 + t = 1 + f0 = 55 + f1 = 22050 + s = np.linspace(0, t, fs * t) + x = chirp(s, f0, 1, f1, method="logarithmic") + x = x.astype(dtype=np.float32) + + # Magnitude + stft = CQT2010( + sr=fs, fmin=110, output_format="Magnitude", n_bins=160, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + + # Complex + stft = CQT2010( + sr=fs, fmin=110, output_format="Complex", n_bins=160, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + + # Phase + stft = CQT2010( + sr=fs, fmin=110, output_format="Phase", n_bins=160, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + assert True + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cqt_1992_v2_log(device): + # Log sweep case + fs = 44100 + t = 1 + f0 = 55 + f1 = 22050 + s = np.linspace(0, t, fs * t) + x = chirp(s, f0, 1, f1, method="logarithmic") + x = x.astype(dtype=np.float32) + + # Magnitude + stft = CQT1992v2( + sr=fs, fmin=55, output_format="Magnitude", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + ground_truth = np.load( + os.path.join(dir_path, "ground-truths/log-sweep-cqt-1992-mag-ground-truth.npy") + ) + X = torch.log(X + 1e-5) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + # Complex + stft = CQT1992v2( + sr=fs, fmin=55, output_format="Complex", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/log-sweep-cqt-1992-complex-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + # Phase + stft = CQT1992v2( + sr=fs, fmin=55, output_format="Phase", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/log-sweep-cqt-1992-phase-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cqt_1992_v2_linear(device): + # Linear sweep case + fs = 44100 + t = 1 + f0 = 55 + f1 = 22050 + s = np.linspace(0, t, fs * t) + x = chirp(s, f0, 1, f1, method="linear") + x = x.astype(dtype=np.float32) + + # Magnitude + stft = CQT1992v2( + sr=fs, fmin=55, output_format="Magnitude", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/linear-sweep-cqt-1992-mag-ground-truth.npy" + ) + ) + X = torch.log(X + 1e-5) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + # Complex + stft = CQT1992v2( + sr=fs, fmin=55, output_format="Complex", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/linear-sweep-cqt-1992-complex-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + # Phase + stft = CQT1992v2( + sr=fs, fmin=55, output_format="Phase", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/linear-sweep-cqt-1992-phase-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cqt_2010_v2_log(device): + # Log sweep case + fs = 44100 + t = 1 + f0 = 55 + f1 = 22050 + s = np.linspace(0, t, fs * t) + x = chirp(s, f0, 1, f1, method="logarithmic") + x = x.astype(dtype=np.float32) + + # Magnitude + stft = CQT2010v2( + sr=fs, fmin=55, output_format="Magnitude", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + X = torch.log(X + 1e-2) + # np.save(os.path.join(dir_path, "ground-truths/log-sweep-cqt-2010-mag-ground-truth", X.cpu()) + ground_truth = np.load( + os.path.join(dir_path, "ground-truths/log-sweep-cqt-2010-mag-ground-truth.npy") + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + # Complex + stft = CQT2010v2( + sr=fs, fmin=55, output_format="Complex", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + # np.save(os.path.join(dir_path, "ground-truths/log-sweep-cqt-2010-complex-ground-truth", X.cpu()) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/log-sweep-cqt-2010-complex-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + +@pytest.mark.parametrize("device", [*device_args]) +def test_cqt_2010_v2_linear(device): + # Linear sweep case + fs = 44100 + t = 1 + f0 = 55 + f1 = 22050 + s = np.linspace(0, t, fs * t) + x = chirp(s, f0, 1, f1, method="linear") + x = x.astype(dtype=np.float32) + + # Magnitude + stft = CQT2010v2( + sr=fs, fmin=55, output_format="Magnitude", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + X = torch.log(X + 1e-2) + # np.save(os.path.join(dir_path, "ground-truths/linear-sweep-cqt-2010-mag-ground-truth", X.cpu()) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/linear-sweep-cqt-2010-mag-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + # Complex + stft = CQT2010v2( + sr=fs, fmin=55, output_format="Complex", n_bins=207, bins_per_octave=24 + ).to(device) + X = stft(torch.tensor(x, device=device).unsqueeze(0)) + # np.save(os.path.join(dir_path, "ground-truths/linear-sweep-cqt-2010-complex-ground-truth", X.cpu()) + ground_truth = np.load( + os.path.join( + dir_path, "ground-truths/linear-sweep-cqt-2010-complex-ground-truth.npy" + ) + ) + assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) + + +if torch.cuda.is_available(): + x = torch.randn((4, 44100)).to( + f"cuda:{gpu_idx}" + ) # Create a batch of input for the following Data.Parallel test + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) + def test_CQT1992_Parallel(device): + spec_layer = CQT1992(fmin=110, n_bins=60, bins_per_octave=12).to(device) + spec_layer_parallel = torch.nn.DataParallel(spec_layer) + spec = spec_layer_parallel(x) + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) + def test_CQT1992v2_Parallel(device): + spec_layer = CQT1992v2().to(device) + spec_layer_parallel = torch.nn.DataParallel(spec_layer) + spec = spec_layer_parallel(x) + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) + def test_CQT2010_Parallel(device): + spec_layer = CQT2010().to(device) + spec_layer_parallel = torch.nn.DataParallel(spec_layer) + spec = spec_layer_parallel(x) + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) + def test_CQT2010v2_Parallel(device): + spec_layer = CQT2010v2().to(device) + spec_layer_parallel = torch.nn.DataParallel(spec_layer) + spec = spec_layer_parallel(x) diff --git a/Installation/tests/tests_stft.py b/Installation/tests/test_stft.py similarity index 73% rename from Installation/tests/tests_stft.py rename to Installation/tests/test_stft.py index 5f22e75..5ffc2a8 100644 --- a/Installation/tests/tests_stft.py +++ b/Installation/tests/test_stft.py @@ -3,20 +3,21 @@ import torch from scipy.signal import chirp, sweep_poly import sys -sys.path.insert(0, './') + +sys.path.insert(0, "./") from nnAudio.Spectrogram import * from parameters import * import warnings -gpu_idx=0 # Choose which GPU to use +gpu_idx = 0 # Choose which GPU to use # If GPU is avaliable, also test on GPU if torch.cuda.is_available(): - device_args = ['cpu', f'cuda:{gpu_idx}'] + device_args = ["cpu", f"cuda:{gpu_idx}"] else: warnings.warn("GPU is not avaliable, testing only on CPU") - device_args = ['cpu'] + device_args = ["cpu"] # librosa example audio for testing example_y, example_sr = librosa.load(librosa.util.example_audio_file()) @@ -41,7 +42,9 @@ def test_inverse2(n_fft, hop_length, window, device): @pytest.mark.parametrize("device", [*device_args]) def test_inverse(n_fft, hop_length, window, device): x = torch.tensor(example_y, device=device) - stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, iSTFT=True).to(device) + stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, iSTFT=True).to( + device + ) X = stft(x.unsqueeze(0), output_format="Complex") x_recon = stft.inverse(X, length=x.shape[0]).squeeze() assert np.allclose(x.cpu(), x_recon.cpu(), rtol=1e-3, atol=1) @@ -59,8 +62,9 @@ def test_stft_complex(n_fft, hop_length, window, device): X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Complex") X_real, X_imag = X[:, :, :, 0].squeeze(), X[:, :, :, 1].squeeze() X_librosa = librosa.stft(x, n_fft=n_fft, hop_length=hop_length, window=window) - real_diff, imag_diff = np.allclose(X_real.cpu(), X_librosa.real, rtol=1e-3, atol=1e-3), \ - np.allclose(X_imag.cpu(), X_librosa.imag, rtol=1e-3, atol=1e-3) + real_diff, imag_diff = np.allclose( + X_real.cpu(), X_librosa.real, rtol=1e-3, atol=1e-3 + ), np.allclose(X_imag.cpu(), X_librosa.imag, rtol=1e-3, atol=1e-3) assert real_diff and imag_diff @@ -72,9 +76,12 @@ def test_stft_complex_winlength(n_fft, win_length, hop_length, device): stft = STFT(n_fft=n_fft, win_length=win_length, hop_length=hop_length).to(device) X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Complex") X_real, X_imag = X[:, :, :, 0].squeeze(), X[:, :, :, 1].squeeze() - X_librosa = librosa.stft(x, n_fft=n_fft, win_length=win_length, hop_length=hop_length) - real_diff, imag_diff = np.allclose(X_real.cpu(), X_librosa.real, rtol=1e-3, atol=1e-3), \ - np.allclose(X_imag.cpu(), X_librosa.imag, rtol=1e-3, atol=1e-3) + X_librosa = librosa.stft( + x, n_fft=n_fft, win_length=win_length, hop_length=hop_length + ) + real_diff, imag_diff = np.allclose( + X_real.cpu(), X_librosa.real, rtol=1e-3, atol=1e-3 + ), np.allclose(X_imag.cpu(), X_librosa.imag, rtol=1e-3, atol=1e-3) assert real_diff and imag_diff @@ -82,7 +89,9 @@ def test_stft_complex_winlength(n_fft, win_length, hop_length, device): def test_stft_magnitude(device): x = example_y stft = STFT(n_fft=2048, hop_length=512).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Magnitude").squeeze() + X = stft( + torch.tensor(x, device=device).unsqueeze(0), output_format="Magnitude" + ).squeeze() X_librosa, _ = librosa.core.magphase(librosa.stft(x, n_fft=2048, hop_length=512)) assert np.allclose(X.cpu(), X_librosa, rtol=1e-3, atol=1e-3) @@ -95,8 +104,9 @@ def test_stft_phase(device): X_real, X_imag = torch.cos(X).squeeze(), torch.sin(X).squeeze() _, X_librosa = librosa.core.magphase(librosa.stft(x, n_fft=2048, hop_length=512)) - real_diff, imag_diff = np.mean(np.abs(X_real.cpu().numpy() - X_librosa.real)), \ - np.mean(np.abs(X_imag.cpu().numpy() - X_librosa.imag)) + real_diff, imag_diff = np.mean( + np.abs(X_real.cpu().numpy() - X_librosa.real) + ), np.mean(np.abs(X_imag.cpu().numpy() - X_librosa.imag)) # I find that np.allclose is too strict for allowing phase to be similar to librosa. # Hence for phase we use average element-wise distance as the test metric. @@ -104,18 +114,28 @@ def test_stft_phase(device): if torch.cuda.is_available(): - x = torch.randn((4,44100)).to(f'cuda:{gpu_idx}') # Create a batch of input for the following Data.Parallel test - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) + x = torch.randn((4, 44100)).to( + f"cuda:{gpu_idx}" + ) # Create a batch of input for the following Data.Parallel test + + @pytest.mark.parametrize("device", [f"cuda:{gpu_idx}"]) def test_STFT_Parallel(device): - spec_layer = STFT(hop_length=512, n_fft=2048, window='hann', - freq_scale='no', - output_format='Complex').to(device) - inverse_spec_layer = iSTFT(hop_length=512, n_fft=2048, window='hann', - freq_scale='no').to(device) + spec_layer = STFT( + hop_length=512, + n_fft=2048, + window="hann", + freq_scale="no", + output_format="Complex", + ).to(device) + inverse_spec_layer = iSTFT( + hop_length=512, n_fft=2048, window="hann", freq_scale="no" + ).to(device) spec_layer_parallel = torch.nn.DataParallel(spec_layer) inverse_spec_layer_parallel = torch.nn.DataParallel(inverse_spec_layer) spec = spec_layer_parallel(x) x_recon = inverse_spec_layer_parallel(spec, onesided=True, length=x.shape[-1]) - assert np.allclose(x_recon.detach().cpu(), x.detach().cpu(), rtol=1e-3, atol=1e-3) \ No newline at end of file + assert np.allclose( + x_recon.detach().cpu(), x.detach().cpu(), rtol=1e-3, atol=1e-3 + ) diff --git a/Installation/tests/tests_cfp.py b/Installation/tests/tests_cfp.py deleted file mode 100644 index 0ae65d6..0000000 --- a/Installation/tests/tests_cfp.py +++ /dev/null @@ -1,89 +0,0 @@ -import pytest -import librosa -import torch -from scipy.signal import chirp, sweep_poly -import sys - -sys.path.insert(0, './') -from nnAudio.Spectrogram import * -from parameters import * -import warnings - -gpu_idx = 0 # Choose which GPU to use - -# If GPU is avaliable, also test on GPU -if torch.cuda.is_available(): - device_args = ['cpu', f'cuda:{gpu_idx}'] -else: - warnings.warn("GPU is not avaliable, testing only on CPU") - device_args = ['cpu'] - -# librosa example audio for testing -example_y, example_sr = librosa.load(librosa.util.example_audio_file()) - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cfp_original(device): - x = torch.tensor(example_y, device=device).unsqueeze(0) - - cfp_layer = Combined_Frequency_Periodicity(fr=2, - fs=44100, - hop_length=320, - window_size=2049, - fc=80, - tc=0.001, - g=[0.24, 0.6, 1], - NumPerOct=48, ).to(device) - X = cfp_layer(x) - ground_truth = torch.load("tests/ground-truths/cfp_original.pt") - - for i, j in zip(X, ground_truth): - assert torch.allclose(i.cpu(), j, 1e-3, 1e-1) - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cfp_new(device): - x = torch.tensor(example_y, device=device).unsqueeze(0) - - cfp_layer = CFP(fr=2, - fs=44100, - hop_length=320, - window_size=2049, - fc=80, - tc=0.001, - g=[0.24, 0.6, 1], - NumPerOct=48, ).to(device) - X = cfp_layer(x) - ground_truth = torch.load("tests/ground-truths/cfp_new.pt") - assert torch.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-1) - - -if torch.cuda.is_available(): - x = torch.randn((4, 44100)).to(f'cuda:{gpu_idx}') # Create a batch of input for the following Data.Parallel test - - - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) - def test_cfp_original_Parallel(device): - cfp_layer = Combined_Frequency_Periodicity(fr=2, - fs=44100, - hop_length=320, - window_size=2049, - fc=80, - tc=0.001, - g=[0.24, 0.6, 1], - NumPerOct=48, ).to(device) - cfp_layer = torch.nn.DataParallel(cfp_layer) - X = cfp_layer(x) - - - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) - def test_cfp_new_Parallel(device): - cfp_layer = CFP(fr=2, - fs=44100, - hop_length=320, - window_size=2049, - fc=80, - tc=0.001, - g=[0.24, 0.6, 1], - NumPerOct=48, ).to(device) - X = cfp_layer(x.to(device)) \ No newline at end of file diff --git a/Installation/tests/tests_cqt.py b/Installation/tests/tests_cqt.py deleted file mode 100644 index ab448a9..0000000 --- a/Installation/tests/tests_cqt.py +++ /dev/null @@ -1,232 +0,0 @@ -import pytest -import librosa -import torch -from scipy.signal import chirp, sweep_poly -import sys - -sys.path.insert(0, './') -from nnAudio.Spectrogram import * -from parameters import * -import warnings - -gpu_idx = 0 # Choose which GPU to use - -# If GPU is avaliable, also test on GPU -if torch.cuda.is_available(): - device_args = ['cpu', f'cuda:{gpu_idx}'] -else: - warnings.warn("GPU is not avaliable, testing only on CPU") - device_args = ['cpu'] - -# librosa example audio for testing -example_y, example_sr = librosa.load(librosa.util.example_audio_file()) - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cqt_1992(device): - # Log sweep case - fs = 44100 - t = 1 - f0 = 55 - f1 = 22050 - s = np.linspace(0, t, fs * t) - x = chirp(s, f0, 1, f1, method='logarithmic') - x = x.astype(dtype=np.float32) - - # Magnitude - stft = CQT1992(sr=fs, fmin=220, output_format="Magnitude", - n_bins=80, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - - # Complex - stft = CQT1992(sr=fs, fmin=220, output_format="Complex", - n_bins=80, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - - # Phase - stft = CQT1992(sr=fs, fmin=220, output_format="Phase", - n_bins=160, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - - assert True - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cqt_2010(device): - # Log sweep case - fs = 44100 - t = 1 - f0 = 55 - f1 = 22050 - s = np.linspace(0, t, fs * t) - x = chirp(s, f0, 1, f1, method='logarithmic') - x = x.astype(dtype=np.float32) - - # Magnitude - stft = CQT2010(sr=fs, fmin=110, output_format="Magnitude", - n_bins=160, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - - # Complex - stft = CQT2010(sr=fs, fmin=110, output_format="Complex", - n_bins=160, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - - # Phase - stft = CQT2010(sr=fs, fmin=110, output_format="Phase", - n_bins=160, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - assert True - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cqt_1992_v2_log(device): - # Log sweep case - fs = 44100 - t = 1 - f0 = 55 - f1 = 22050 - s = np.linspace(0, t, fs * t) - x = chirp(s, f0, 1, f1, method='logarithmic') - x = x.astype(dtype=np.float32) - - # Magnitude - stft = CQT1992v2(sr=fs, fmin=55, output_format="Magnitude", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - ground_truth = np.load("tests/ground-truths/log-sweep-cqt-1992-mag-ground-truth.npy") - X = torch.log(X + 1e-5) - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - # Complex - stft = CQT1992v2(sr=fs, fmin=55, output_format="Complex", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - ground_truth = np.load("tests/ground-truths/log-sweep-cqt-1992-complex-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - # Phase - stft = CQT1992v2(sr=fs, fmin=55, output_format="Phase", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - ground_truth = np.load("tests/ground-truths/log-sweep-cqt-1992-phase-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cqt_1992_v2_linear(device): - # Linear sweep case - fs = 44100 - t = 1 - f0 = 55 - f1 = 22050 - s = np.linspace(0, t, fs * t) - x = chirp(s, f0, 1, f1, method='linear') - x = x.astype(dtype=np.float32) - - # Magnitude - stft = CQT1992v2(sr=fs, fmin=55, output_format="Magnitude", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-1992-mag-ground-truth.npy") - X = torch.log(X + 1e-5) - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - # Complex - stft = CQT1992v2(sr=fs, fmin=55, output_format="Complex", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-1992-complex-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - # Phase - stft = CQT1992v2(sr=fs, fmin=55, output_format="Phase", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-1992-phase-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cqt_2010_v2_log(device): - # Log sweep case - fs = 44100 - t = 1 - f0 = 55 - f1 = 22050 - s = np.linspace(0, t, fs * t) - x = chirp(s, f0, 1, f1, method='logarithmic') - x = x.astype(dtype=np.float32) - - # Magnitude - stft = CQT2010v2(sr=fs, fmin=55, output_format="Magnitude", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - X = torch.log(X + 1e-2) - # np.save("tests/ground-truths/log-sweep-cqt-2010-mag-ground-truth", X.cpu()) - ground_truth = np.load("tests/ground-truths/log-sweep-cqt-2010-mag-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - # Complex - stft = CQT2010v2(sr=fs, fmin=55, output_format="Complex", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - # np.save("tests/ground-truths/log-sweep-cqt-2010-complex-ground-truth", X.cpu()) - ground_truth = np.load("tests/ground-truths/log-sweep-cqt-2010-complex-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - -@pytest.mark.parametrize("device", [*device_args]) -def test_cqt_2010_v2_linear(device): - # Linear sweep case - fs = 44100 - t = 1 - f0 = 55 - f1 = 22050 - s = np.linspace(0, t, fs * t) - x = chirp(s, f0, 1, f1, method='linear') - x = x.astype(dtype=np.float32) - - # Magnitude - stft = CQT2010v2(sr=fs, fmin=55, output_format="Magnitude", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - X = torch.log(X + 1e-2) - # np.save("tests/ground-truths/linear-sweep-cqt-2010-mag-ground-truth", X.cpu()) - ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-2010-mag-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - # Complex - stft = CQT2010v2(sr=fs, fmin=55, output_format="Complex", - n_bins=207, bins_per_octave=24).to(device) - X = stft(torch.tensor(x, device=device).unsqueeze(0)) - # np.save("tests/ground-truths/linear-sweep-cqt-2010-complex-ground-truth", X.cpu()) - ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-2010-complex-ground-truth.npy") - assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3) - - -if torch.cuda.is_available(): - x = torch.randn((4,44100)).to(f'cuda:{gpu_idx}') # Create a batch of input for the following Data.Parallel test - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) - def test_CQT1992_Parallel(device): - spec_layer = CQT1992(fmin=110, n_bins=60, bins_per_octave=12).to(device) - spec_layer_parallel = torch.nn.DataParallel(spec_layer) - spec = spec_layer_parallel(x) - - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) - def test_CQT1992v2_Parallel(device): - spec_layer = CQT1992v2().to(device) - spec_layer_parallel = torch.nn.DataParallel(spec_layer) - spec = spec_layer_parallel(x) - - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) - def test_CQT2010_Parallel(device): - spec_layer = CQT2010().to(device) - spec_layer_parallel = torch.nn.DataParallel(spec_layer) - spec = spec_layer_parallel(x) - - @pytest.mark.parametrize("device", [f'cuda:{gpu_idx}']) - def test_CQT2010v2_Parallel(device): - spec_layer = CQT2010v2().to(device) - spec_layer_parallel = torch.nn.DataParallel(spec_layer) - spec = spec_layer_parallel(x) \ No newline at end of file diff --git a/Sphinx/source/conf.py b/Sphinx/source/conf.py index a2a2aef..76ea04f 100755 --- a/Sphinx/source/conf.py +++ b/Sphinx/source/conf.py @@ -15,39 +15,42 @@ import os import codecs import sys -import sphinx_rtd_theme # This is for a nice html theme -sys.path.insert(0, '../Installation/nnAudio') -sys.path.insert(0, '../Installation/') - +import sphinx_rtd_theme # This is for a nice html theme + +sys.path.insert(0, "../Installation/nnAudio") +sys.path.insert(0, "../Installation/") + + def read(rel_path): - with codecs.open(rel_path, 'r') as fp: - return fp.read() - + with codecs.open(rel_path, "r") as fp: + return fp.read() + + def get_version(rel_path): for line in read(rel_path).splitlines(): - if line.startswith('__version__'): + if line.startswith("__version__"): delim = '"' if '"' in line else "'" return line.split(delim)[1] else: - raise RuntimeError("Unable to find version string.") + raise RuntimeError("Unable to find version string.") # -- Project information ----------------------------------------------------- -project = 'nnAudio' -copyright = '2019, Cheuk Kin Wai' -author = 'Cheuk Kin Wai' +project = "nnAudio" +copyright = "2019, Cheuk Kin Wai" +author = "Cheuk Kin Wai" # The short X.Y version version = get_version("../../Installation/nnAudio/__init__.py") # The full version, including alpha/beta/rc tags -release = f'{version}' +release = f"{version}" -#This line is for the rst files to read the version number via |ProjectVersion| +# This line is for the rst files to read the version number via |ProjectVersion| rst_epilog = """ .. |ProjectVersion| replace:: {versionnum} """.format( -versionnum = version, + versionnum=version, ) @@ -61,27 +64,30 @@ def get_version(rel_path): # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autosectionlabel', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'sphinx_rtd_theme' + "sphinx.ext.autosectionlabel", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx_rtd_theme", ] -autodoc_default_flags = ['members','undoc-members'] # Skip inherited members from PyTorch -autosummary_generate = True # To generate one class per page +autodoc_default_flags = [ + "members", + "undoc-members", +] # Skip inherited members from PyTorch +autosummary_generate = True # To generate one class per page # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -104,9 +110,9 @@ def get_version(rel_path): # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" html_logo = "logo.png" -html_title = f'{version}' +html_title = f"{version}" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -133,7 +139,7 @@ def get_version(rel_path): # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'nnAudiodoc' +htmlhelp_basename = "nnAudiodoc" # -- Options for LaTeX output ------------------------------------------------ @@ -142,15 +148,12 @@ def get_version(rel_path): # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -160,8 +163,7 @@ def get_version(rel_path): # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'nnAudio.tex', 'nnAudio Documentation', - 'Cheuk Kin Wai', 'manual'), + (master_doc, "nnAudio.tex", "nnAudio Documentation", "Cheuk Kin Wai", "manual"), ] @@ -169,10 +171,7 @@ def get_version(rel_path): # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'nnaudio', 'nnAudio Documentation', - [author], 1) -] +man_pages = [(master_doc, "nnaudio", "nnAudio Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -181,9 +180,15 @@ def get_version(rel_path): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'nnAudio', 'nnAudio Documentation', - author, 'nnAudio', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "nnAudio", + "nnAudio Documentation", + author, + "nnAudio", + "One line description of project.", + "Miscellaneous", + ), ] @@ -202,7 +207,7 @@ def get_version(rel_path): # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration -------------------------------------------------