forked from KinWaiCheuk/nnAudio
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Using librosa without the need of installing it
- Loading branch information
1 parent
6a361de
commit 561ff81
Showing
8 changed files
with
1,510 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ Installation/build | |
Installation/dist | ||
Bach.wav | ||
Chopin.wav | ||
statistic.ipynb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
ISC License | ||
|
||
Copyright (c) 2013--2017, librosa development team. | ||
|
||
Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,296 @@ | ||
import numpy as np | ||
### ----------------Functions for generating kernel for Mel Spectrogram------------ ### | ||
# This code is equivalent to `from librosa.filters import mel` | ||
# By doing so, we can run nnAudio without installing librosa | ||
|
||
def mel_to_hz(mels, htk=False):
    """Map mel-scale values back to frequencies in Hz.

    With ``htk=False`` the Slaney-style piecewise scale is used: the
    mapping is linear (200/3 Hz per mel) up to 1000 Hz and exponential
    above it.  With ``htk=True`` the closed-form HTK formula
    ``700 * (10 ** (mel / 2595) - 1)`` is applied instead.

    Parameters
    ----------
    mels : number or np.ndarray [shape=(n,)], float
        Mel value(s) to convert.
    htk : bool
        Use the HTK formula instead of the Slaney one.

    Returns
    -------
    frequencies : number or np.ndarray [shape=(n,)]
        The input mel values expressed in Hz.

    See Also
    --------
    hz_to_mel

    Examples
    --------
    >>> mel_to_hz(3)
    200.
    >>> mel_to_hz([1, 2, 3, 4, 5])
    array([  66.667,  133.333,  200.   ,  266.667,  333.333])
    """
    mel_values = np.asanyarray(mels)

    if htk:
        return 700.0 * (10.0 ** (mel_values / 2595.0) - 1.0)

    # Constants of the piecewise (Slaney) scale.
    linear_origin_hz = 0.0
    hz_per_mel = 200.0 / 3
    knee_hz = 1000.0                                      # start of the log region (Hz)
    knee_mel = (knee_hz - linear_origin_hz) / hz_per_mel  # same point, in mels
    log_rate = np.log(6.4) / 27.0                         # step size in the log region

    # Start from the linear mapping everywhere; the log region is patched below.
    hz = linear_origin_hz + hz_per_mel * mel_values

    if mel_values.ndim == 0:
        # Scalar input: a boolean mask cannot be used, so test the value directly.
        if mel_values >= knee_mel:
            return knee_hz * np.exp(log_rate * (mel_values - knee_mel))
        return hz

    # Array input: overwrite only the entries that fall in the log region.
    above_knee = mel_values >= knee_mel
    hz[above_knee] = knee_hz * np.exp(
        log_rate * (mel_values[above_knee] - knee_mel))
    return hz
|
||
def hz_to_mel(frequencies, htk=False):
    """Map frequencies in Hz onto the mel scale.

    With ``htk=False`` the Slaney-style piecewise scale is used: the
    mapping is linear (200/3 Hz per mel) below 1000 Hz and logarithmic
    from 1000 Hz upwards.  With ``htk=True`` the closed-form HTK formula
    ``2595 * log10(1 + f / 700)`` is applied instead.

    Parameters
    ----------
    frequencies : number or np.ndarray [shape=(n,)], float
        Scalar or array of frequencies in Hz.
    htk : bool
        Use the HTK formula instead of the Slaney one.

    Returns
    -------
    mels : number or np.ndarray [shape=(n,)]
        The input frequencies expressed in mels.

    See Also
    --------
    mel_to_hz

    Examples
    --------
    >>> hz_to_mel(60)
    0.9
    >>> hz_to_mel([110, 220, 440])
    array([ 1.65,  3.3 ,  6.6 ])
    """
    hz = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    # Constants of the piecewise (Slaney) scale.
    linear_origin_hz = 0.0
    hz_per_mel = 200.0 / 3
    knee_hz = 1000.0                                      # start of the log region (Hz)
    knee_mel = (knee_hz - linear_origin_hz) / hz_per_mel  # same point, in mels
    log_rate = np.log(6.4) / 27.0                         # step size in the log region

    # Start from the linear mapping everywhere; the log region is patched below.
    mel_values = (hz - linear_origin_hz) / hz_per_mel

    if hz.ndim == 0:
        # Scalar input: a boolean mask cannot be used, so test the value directly.
        if hz >= knee_hz:
            mel_values = knee_mel + np.log(hz / knee_hz) / log_rate
        return mel_values

    # Array input: overwrite only the entries that fall in the log region.
    above_knee = hz >= knee_hz
    mel_values[above_knee] = (
        knee_mel + np.log(hz[above_knee] / knee_hz) / log_rate)
    return mel_values
|
||
def fft_frequencies(sr=22050, n_fft=2048):
    """Return the centre frequencies of the non-negative FFT bins.

    The result is an evenly spaced grid of ``1 + n_fft // 2`` points that
    starts at 0 Hz and ends exactly at ``sr / 2``.

    Parameters
    ----------
    sr : number > 0 [scalar]
        Audio sampling rate.
    n_fft : int > 0 [scalar]
        FFT window size.

    Returns
    -------
    freqs : np.ndarray [shape=(1 + n_fft/2,)]
        Frequencies ``(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)``.

    Notes
    -----
    The grid always ends at ``sr / 2``, so the spacing equals
    ``sr / n_fft`` only when ``n_fft`` is even.

    Examples
    --------
    >>> fft_frequencies(sr=22050, n_fft=16)
    array([     0.   ,   1378.125,   2756.25 ,   4134.375,
             5512.5  ,   6890.625,   8268.75 ,   9646.875,  11025.   ])
    """
    nyquist_hz = float(sr) / 2
    bin_count = int(1 + n_fft // 2)
    return np.linspace(0, nyquist_hz, bin_count, endpoint=True)
|
||
def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False):
    """Return `n_mels` frequencies (Hz) that are evenly spaced on the mel axis.

    `fmin` and `fmax` are converted to mels, a uniform grid of `n_mels`
    points is laid out between them (both ends included), and the grid is
    converted back to Hz.

    Several definitions of the mel scale coexist in the audio signal
    processing literature [1]_.  With ``htk=False`` the conversion follows
    Slaney's MATLAB Auditory Toolbox [2]_: linear below 1 kHz and
    logarithmic above 1 kHz.  With ``htk=True`` it follows the Hidden
    Markov Toolkit [3]_ (HTK) formula
    ``mel = 2595.0 * np.log10(1.0 + f / 700.0)``.

    .. [1] Umesh, S., Cohen, L., & Nelson, D. Fitting the mel scale.
        In Proc. International Conference on Acoustics, Speech, and Signal
        Processing (ICASSP), vol. 1, pp. 217-220, 1998.
    .. [2] Slaney, M. Auditory Toolbox: A MATLAB Toolbox for Auditory
        Modeling Work. Technical Report, version 2, Interval Research
        Corporation, 1998.
    .. [3] Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X.,
        Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., &
        Woodland, P. The HTK book, version 3.4. Cambridge University,
        March 2009.

    Parameters
    ----------
    n_mels : int > 0 [scalar]
        Number of mel bins.
    fmin : float >= 0 [scalar]
        Minimum frequency (Hz).
    fmax : float >= 0 [scalar]
        Maximum frequency (Hz).
    htk : bool
        If True, use the HTK formula to convert between Hz and mel.
        Otherwise (False), use Slaney's Auditory Toolbox formula.

    Returns
    -------
    bin_frequencies : ndarray [shape=(n_mels,)]
        Vector of `n_mels` frequencies in Hz which are uniformly spaced on
        the mel axis.

    See Also
    --------
    hz_to_mel
    mel_to_hz

    Examples
    --------
    >>> mel_frequencies(n_mels=40)
    array([     0.   ,     85.317,    170.635,    255.952,
              341.269,    426.586,    511.904,    597.221,
              682.538,    767.855,    853.173,    938.49 ,
             1024.856,   1119.114,   1222.042,   1334.436,
             1457.167,   1591.187,   1737.532,   1897.337,
             2071.84 ,   2262.393,   2470.47 ,   2697.686,
             2945.799,   3216.731,   3512.582,   3835.643,
             4188.417,   4573.636,   4994.285,   5453.621,
             5955.205,   6502.92 ,   7101.009,   7754.107,
             8467.272,   9246.028,  10096.408,  11025.   ])
    """
    # Band limits expressed on the mel axis (lower edge first, then upper).
    mel_edges = [hz_to_mel(edge_hz, htk=htk) for edge_hz in (fmin, fmax)]

    # Uniform grid in mel space, mapped back to Hz.
    mel_grid = np.linspace(mel_edges[0], mel_edges[1], n_mels)
    return mel_to_hz(mel_grid, htk=htk)
|
||
def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
        norm=1, dtype=np.float32):
    """Create a filterbank matrix to combine FFT bins into mel-frequency bins.

    Each row of the result is a triangular filter whose corners are three
    consecutive points of a mel-spaced frequency grid running from `fmin`
    to `fmax`.

    Parameters
    ----------
    sr : number > 0 [scalar]
        Sampling rate of the incoming signal.
    n_fft : int > 0 [scalar]
        Number of FFT components.
    n_mels : int > 0 [scalar]
        Number of mel bands to generate.
    fmin : float >= 0 [scalar]
        Lowest frequency (in Hz).
    fmax : float >= 0 [scalar]
        Highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``.
    htk : bool [scalar]
        Use the HTK formula instead of the Slaney one.
    norm : {None, 1, np.inf} [scalar]
        If 1, scale each triangle by ``2 / (band width in Hz)`` (area
        normalization).  For `None` and `np.inf` the triangles are left
        unscaled, i.e. aiming for a peak value of 1.0.
    dtype : np.dtype
        The data type of the output basis.
        By default, uses 32-bit (single-precision) floating point.

    Returns
    -------
    M : np.ndarray [shape=(n_mels, 1 + n_fft/2)]
        Mel transform matrix.

    Raises
    ------
    ValueError
        If `norm` is not one of `None`, 1 or `np.inf`.

    Warns
    -----
    UserWarning
        If a filter whose lower edge is above 0 Hz has no non-zero weight
        (too many mel bands for the available FFT resolution).

    Examples
    --------
    >>> melfb = mel(22050, 2048)
    >>> melfb.shape
    (128, 1025)

    Clip the maximum frequency to 8 kHz

    >>> mel(22050, 2048, fmax=8000).shape
    (128, 1025)
    """
    # Imported locally so the function does not depend on a module-level
    # `import warnings` (the visible module header only imports numpy).
    import warnings

    if fmax is None:
        fmax = float(sr) / 2

    if norm is not None and norm != 1 and norm != np.inf:
        # A built-in exception is raised here: no `ParameterError` class is
        # defined or imported in the visible module.
        raise ValueError('Unsupported norm: {}'.format(repr(norm)))

    # Initialize the weights
    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # 'Center freqs' of mel bands - uniformly spaced between limits.
    # Two extra points provide the outer corners of the first/last triangle.
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    # fdiff[k]    : width (Hz) between consecutive mel grid points
    # ramps[k, j] : signed distance (Hz) from FFT bin j to mel grid point k
    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]

        # .. then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper))

    if norm == 1:
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm[:, np.newaxis]

    # Only check weights if the filter's lower edge (mel_f[i]) is positive
    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
        # This means we have an empty channel somewhere
        warnings.warn('Empty filters detected in mel frequency basis. '
                      'Some channels will produce empty responses. '
                      'Try increasing your sampling rate (and fmax) or '
                      'reducing n_mels.')

    return weights
### ------------------End of Functions for generating kernel for Mel Spectrogram ----------------### |
Oops, something went wrong.