
Commit

Using librosa without needing to install it
KinWaiCheuk committed Nov 17, 2019
1 parent 6a361de commit 561ff81
Showing 8 changed files with 1,510 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ Installation/build
Installation/dist
Bach.wav
Chopin.wav
statistic.ipynb
60 changes: 57 additions & 3 deletions Installation/nnAudio/Spectrogram.py
@@ -11,7 +11,7 @@
from scipy import fft
import warnings

from librosa.filters import mel # This function is required for calculating Melspectrogram
from librosa_filters import mel # This is equivalent to `from librosa.filters import mel`

sz_float = 4 # size of a float
epsilon = 10e-8 # fudge factor for normalization
@@ -501,7 +501,6 @@ def forward(self,x_real,x_imag):
imag = a2+b1
real = a1-b2
return (real/self.n_fft, imag/self.n_fft)

class MelSpectrogram(torch.nn.Module):
def __init__(self, sr=22050, n_fft=2048, n_mels=128, hop_length=512, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False):
super(MelSpectrogram, self).__init__()
@@ -542,6 +541,57 @@ def forward(self,x):
spec = conv1d(x, self.wsin, stride=self.stride).pow(2) \
+ conv1d(x, self.wcos, stride=self.stride).pow(2) # Doing STFT by using conv1d

melspec = torch.matmul(self.mel_basis, spec)
return melspec
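
For reference, a minimal usage sketch (not part of the diff) of the MelSpectrogram layer whose forward pass is shown above, run on a synthetic 440 Hz tone. It assumes nnAudio is installed from the Installation directory as of this commit; the exact frame count depends on the padding settings.

# Hedged usage sketch -- not part of the commit.
import numpy as np
import torch
from nnAudio.Spectrogram import MelSpectrogram

sr = 22050
t = np.linspace(0, 1, sr, dtype=np.float32)
x = torch.tensor(np.sin(2 * np.pi * 440 * t))      # 1-second 440 Hz tone, shape (22050,)

layer = MelSpectrogram(sr=sr, n_fft=2048, n_mels=128, hop_length=512)
melspec = layer(x)                                  # (1, n_mels, n_frames)
print(melspec.shape)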


class MelSpectrogramv2(torch.nn.Module):
def __init__(self, sr=22050, n_fft=2048, n_mels=128, hop_length=512, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False):
super(MelSpectrogramv2, self).__init__()
self.stride = hop_length
self.center = center
self.pad_mode = pad_mode
self.n_fft = n_fft
self.trainable_STFT=trainable_STFT

# Create filter windows for stft
if self.trainable_STFT==True:
start = time()
wsin, wcos, self.bins2freq, _ = create_fourier_kernels(n_fft, freq_bins=None, window=window, freq_scale='no', sr=sr)
self.wsin = torch.tensor(wsin, dtype=torch.float)
self.wcos = torch.tensor(wcos, dtype=torch.float)
self.wsin = torch.nn.Parameter(self.wsin)
self.wcos = torch.nn.Parameter(self.wcos)
print("STFT filter created, time used = {:.4f} seconds".format(time()-start))
else:
window = get_window(window,int(n_fft), fftbins=True).astype(np.float32)
self.window = torch.tensor(window)
# Creating kernel for mel spectrogram
start = time()
mel_basis = mel(sr, n_fft, n_mels, fmin, fmax, htk=htk, norm=norm)
self.mel_basis = torch.tensor(mel_basis)
print("Mel filter created, time used = {:.4f} seconds".format(time()-start))

if trainable_mel==True:
self.mel_basis = torch.nn.Parameter(self.mel_basis)



def forward(self,x):
x = broadcast_dim(x)
if self.center:
if self.pad_mode == 'constant':
padding = nn.ConstantPad1d(self.n_fft//2, 0)
elif self.pad_mode == 'reflect':
padding = nn.ReflectionPad1d(self.n_fft//2)

x = padding(x)
if self.trainable_STFT==False:
spec = torch.stft(x, self.n_fft, self.stride, window=self.window)
else:
spec = conv1d(x, self.wsin, stride=self.stride).pow(2) \
+ conv1d(x, self.wcos, stride=self.stride).pow(2) # Doing STFT by using conv1d

melspec = torch.matmul(self.mel_basis, spec)
return melspec
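
For reference, a hedged, standalone sketch (not part of the diff) of a torch.stft-based power-mel pipeline like the non-trainable branch above. The exact torch.stft arguments depend on the installed PyTorch version (return_complex=True is the modern form), and mel_basis here is a random placeholder for the real filterbank.

# Hedged sketch -- not part of the commit; illustrative shapes only.
import torch

x = torch.randn(1, 22050)                            # (batch, samples)
window = torch.hann_window(2048)
stft = torch.stft(x, n_fft=2048, hop_length=512, window=window, return_complex=True)
power = stft.abs() ** 2                              # (batch, n_fft//2 + 1, n_frames)
mel_basis = torch.rand(128, 1025)                    # placeholder for mel(sr, n_fft, n_mels)
melspec = torch.matmul(mel_basis, power)             # (batch, n_mels, n_frames)
print(melspec.shape)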

@@ -905,4 +955,8 @@ def forward(self,x):
CQT = CQT*self.downsample_factor/2**(self.n_octaves-1) # Normalizing the output with the downsampling factor; 2**(self.n_octaves-1) makes the magnitude match the 1992 algorithm

return CQT*torch.sqrt(self.lenghts.view(-1,1))



class CQT(CQT1992v2):
"""Using CQT1992v2 as the default CQT algorithm"""
pass
7 changes: 7 additions & 0 deletions Installation/nnAudio/librosa_LICENSE.md
@@ -0,0 +1,7 @@
ISC License

Copyright (c) 2013--2017, librosa development team.

Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
296 changes: 296 additions & 0 deletions Installation/nnAudio/librosa_filters.py
@@ -0,0 +1,296 @@
import numpy as np
import warnings  # used by mel() below when empty filters are detected

### ----------------Functions for generating kernels for Mel Spectrogram------------ ###
# This code is equivalent to `from librosa.filters import mel`.
# By doing so, we can run nnAudio without installing librosa.

def mel_to_hz(mels, htk=False):
"""Convert mel bin numbers to frequencies
Examples
--------
>>> librosa.mel_to_hz(3)
200.
>>> librosa.mel_to_hz([1,2,3,4,5])
array([ 66.667, 133.333, 200. , 266.667, 333.333])
Parameters
----------
mels : np.ndarray [shape=(n,)], float
mel bins to convert
htk : bool
use HTK formula instead of Slaney
Returns
-------
frequencies : np.ndarray [shape=(n,)]
input mels in Hz
See Also
--------
hz_to_mel
"""

mels = np.asanyarray(mels)

if htk:
return 700.0 * (10.0**(mels / 2595.0) - 1.0)

# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mels

# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region

if mels.ndim:
# If we have vector data, vectorize
log_t = (mels >= min_log_mel)
freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
elif mels >= min_log_mel:
# If we have scalar data, check directly
freqs = min_log_hz * np.exp(logstep * (mels - min_log_mel))

return freqs

def hz_to_mel(frequencies, htk=False):
"""Convert Hz to Mels
Examples
--------
>>> librosa.hz_to_mel(60)
0.9
>>> librosa.hz_to_mel([110, 220, 440])
array([ 1.65, 3.3 , 6.6 ])
Parameters
----------
frequencies : number or np.ndarray [shape=(n,)] , float
scalar or array of frequencies
htk : bool
use HTK formula instead of Slaney
Returns
-------
mels : number or np.ndarray [shape=(n,)]
input frequencies in Mels
See Also
--------
mel_to_hz
"""

frequencies = np.asanyarray(frequencies)

if htk:
return 2595.0 * np.log10(1.0 + frequencies / 700.0)

# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3

mels = (frequencies - f_min) / f_sp

# Fill in the log-scale part

min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region

if frequencies.ndim:
# If we have array data, vectorize
log_t = (frequencies >= min_log_hz)
mels[log_t] = min_log_mel + np.log(frequencies[log_t]/min_log_hz) / logstep
elif frequencies >= min_log_hz:
# If we have scalar data, check directly
mels = min_log_mel + np.log(frequencies / min_log_hz) / logstep

return mels
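
A short hedged sketch (not part of the diff) round-tripping Hz -> mel -> Hz with the two helpers above; it assumes Installation/nnAudio is on the Python path so that librosa_filters is importable.

# Hedged sketch -- not part of the commit.
import numpy as np
from librosa_filters import hz_to_mel, mel_to_hz

freqs = np.array([110.0, 440.0, 4000.0])
mels = hz_to_mel(freqs)                  # Slaney scale: linear below 1 kHz, logarithmic above
recovered = mel_to_hz(mels)
print(np.allclose(freqs, recovered))     # True, up to floating-point error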

def fft_frequencies(sr=22050, n_fft=2048):
'''Alternative implementation of `np.fft.fftfreq`
Parameters
----------
sr : number > 0 [scalar]
Audio sampling rate
n_fft : int > 0 [scalar]
FFT window size
Returns
-------
freqs : np.ndarray [shape=(1 + n_fft/2,)]
Frequencies `(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)`
Examples
--------
>>> librosa.fft_frequencies(sr=22050, n_fft=16)
array([ 0. , 1378.125, 2756.25 , 4134.375,
5512.5 , 6890.625, 8268.75 , 9646.875, 11025. ])
'''

return np.linspace(0,
float(sr) / 2,
int(1 + n_fft//2),
endpoint=True)

def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False):
"""Compute an array of acoustic frequencies tuned to the mel scale.
The mel scale is a quasi-logarithmic function of acoustic frequency
designed such that perceptually similar pitch intervals (e.g. octaves)
appear equal in width over the full hearing range.
Because the definition of the mel scale is conditioned by a finite number
of subjective psychoacoustical experiments, several implementations coexist
in the audio signal processing literature [1]_. By default, librosa replicates
the behavior of the well-established MATLAB Auditory Toolbox of Slaney [2]_.
According to this default implementation, the conversion from Hertz to mel is
linear below 1 kHz and logarithmic above 1 kHz. Another available implementation
replicates the Hidden Markov Toolkit [3]_ (HTK) according to the following formula:
`mel = 2595.0 * np.log10(1.0 + f / 700.0).`
The choice of implementation is determined by the `htk` keyword argument: setting
`htk=False` leads to the Auditory toolbox implementation, whereas setting it `htk=True`
leads to the HTK implementation.
.. [1] Umesh, S., Cohen, L., & Nelson, D. Fitting the mel scale.
In Proc. International Conference on Acoustics, Speech, and Signal Processing
(ICASSP), vol. 1, pp. 217-220, 1998.
.. [2] Slaney, M. Auditory Toolbox: A MATLAB Toolbox for Auditory
Modeling Work. Technical Report, version 2, Interval Research Corporation, 1998.
.. [3] Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X.,
Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., & Woodland, P.
The HTK book, version 3.4. Cambridge University, March 2009.
See Also
--------
hz_to_mel
mel_to_hz
librosa.feature.melspectrogram
librosa.feature.mfcc
Parameters
----------
n_mels : int > 0 [scalar]
Number of mel bins.
fmin : float >= 0 [scalar]
Minimum frequency (Hz).
fmax : float >= 0 [scalar]
Maximum frequency (Hz).
htk : bool
If True, use HTK formula to convert Hz to mel.
Otherwise (False), use Slaney's Auditory Toolbox.
Returns
-------
bin_frequencies : ndarray [shape=(n_mels,)]
Vector of n_mels frequencies in Hz which are uniformly spaced on the Mel
axis.
Examples
--------
>>> librosa.mel_frequencies(n_mels=40)
array([ 0. , 85.317, 170.635, 255.952,
341.269, 426.586, 511.904, 597.221,
682.538, 767.855, 853.173, 938.49 ,
1024.856, 1119.114, 1222.042, 1334.436,
1457.167, 1591.187, 1737.532, 1897.337,
2071.84 , 2262.393, 2470.47 , 2697.686,
2945.799, 3216.731, 3512.582, 3835.643,
4188.417, 4573.636, 4994.285, 5453.621,
5955.205, 6502.92 , 7101.009, 7754.107,
8467.272, 9246.028, 10096.408, 11025. ])
"""

# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(fmin, htk=htk)
max_mel = hz_to_mel(fmax, htk=htk)

mels = np.linspace(min_mel, max_mel, n_mels)

return mel_to_hz(mels, htk=htk)

def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
norm=1, dtype=np.float32):
"""Create a Filterbank matrix to combine FFT bins into Mel-frequency bins
Parameters
----------
sr : number > 0 [scalar]
sampling rate of the incoming signal
n_fft : int > 0 [scalar]
number of FFT components
n_mels : int > 0 [scalar]
number of Mel bands to generate
fmin : float >= 0 [scalar]
lowest frequency (in Hz)
fmax : float >= 0 [scalar]
highest frequency (in Hz).
If `None`, use `fmax = sr / 2.0`
htk : bool [scalar]
use HTK formula instead of Slaney
norm : {None, 1, np.inf} [scalar]
if 1, divide the triangular mel weights by the width of the mel band
(area normalization). Otherwise, leave all the triangles aiming for
a peak value of 1.0
dtype : np.dtype
The data type of the output basis.
By default, uses 32-bit (single-precision) floating point.
Returns
-------
M : np.ndarray [shape=(n_mels, 1 + n_fft/2)]
Mel transform matrix
Notes
-----
This function caches at level 10.
Examples
--------
>>> melfb = librosa.filters.mel(22050, 2048)
>>> melfb
array([[ 0. , 0.016, ..., 0. , 0. ],
[ 0. , 0. , ..., 0. , 0. ],
...,
[ 0. , 0. , ..., 0. , 0. ],
[ 0. , 0. , ..., 0. , 0. ]])
Clip the maximum frequency to 8KHz
>>> librosa.filters.mel(22050, 2048, fmax=8000)
array([[ 0. , 0.02, ..., 0. , 0. ],
[ 0. , 0. , ..., 0. , 0. ],
...,
[ 0. , 0. , ..., 0. , 0. ],
[ 0. , 0. , ..., 0. , 0. ]])
>>> import matplotlib.pyplot as plt
>>> plt.figure()
>>> librosa.display.specshow(melfb, x_axis='linear')
>>> plt.ylabel('Mel filter')
>>> plt.title('Mel filter bank')
>>> plt.colorbar()
>>> plt.tight_layout()
>>> plt.show()
"""

if fmax is None:
fmax = float(sr) / 2

if norm is not None and norm != 1 and norm != np.inf:
raise ValueError('Unsupported norm: {}'.format(repr(norm)))  # librosa raises ParameterError here; ValueError avoids the undefined name in this vendored copy

# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)

for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i+2] / fdiff[i+1]

# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))

if norm == 1:
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]

# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
warnings.warn('Empty filters detected in mel frequency basis. '
'Some channels will produce empty responses. '
'Try increasing your sampling rate (and fmax) or '
'reducing n_mels.')

return weights
### ------------------End of Functions for generating kernels for Mel Spectrogram ----------------###
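
A hedged usage sketch (not part of the diff) for the vendored mel() above, which is the function this commit now uses in place of librosa.filters.mel. It assumes Installation/nnAudio is on the Python path; with a contemporaneous librosa installed, librosa.filters.mel(22050, 2048) should produce the same matrix.

# Hedged sketch -- not part of the commit.
import numpy as np
from librosa_filters import mel

sr, n_fft, n_mels = 22050, 2048, 128
mel_basis = mel(sr, n_fft, n_mels=n_mels)
print(mel_basis.shape)                   # (128, 1025) == (n_mels, n_fft//2 + 1)
print(mel_basis.dtype)                   # float32 by default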
