Commit: update to v0.3.0
KinWaiCheuk committed Nov 19, 2021
1 parent 5f3200f commit 744c7c5
Showing 16 changed files with 109 additions and 85 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ statistic.ipynb
Installation/nnAudio/Untitled.ipynb
Installation/nnAudio/Untitled-Copy1.ipynb
Installation/nnAudio/Orthogonal.ipynb
+ Installation/nnAudio.egg-info

.idea
Debug/
16 changes: 0 additions & 16 deletions Installation/nnAudio.egg-info/PKG-INFO

This file was deleted.

10 changes: 0 additions & 10 deletions Installation/nnAudio.egg-info/SOURCES.txt

This file was deleted.

1 change: 0 additions & 1 deletion Installation/nnAudio.egg-info/dependency_links.txt

This file was deleted.

1 change: 0 additions & 1 deletion Installation/nnAudio.egg-info/top_level.txt

This file was deleted.

2 changes: 1 addition & 1 deletion Installation/nnAudio/__init__.py
@@ -1 +1 @@
__version__ = "0.2.6"
__version__ = "0.3.0"
10 changes: 7 additions & 3 deletions Installation/nnAudio/features/gammatone.py
@@ -7,7 +7,9 @@

class Gammatonegram(nn.Module):
"""
- This function is to calculate the Gammatonegram of the input signal. Input signal should be in either of the following shapes. 1. ``(len_audio)``, 2. ``(num_audio, len_audio)``, 3. ``(num_audio, 1, len_audio)``. The correct shape will be inferred autommatically if the input follows these 3 shapes. This class inherits from ``nn.Module``, therefore, the usage is same as ``nn.Module``.
+ This function calculates the Gammatonegram of the input signal.
+ The input signal should be in one of the following shapes: 1. ``(len_audio)``, 2. ``(num_audio, len_audio)``, 3. ``(num_audio, 1, len_audio)``. The correct shape will be inferred automatically if the input follows one of these 3 shapes. This class inherits from ``nn.Module``; therefore, the usage is the same as for any ``nn.Module``.
Parameters
----------
@@ -50,6 +52,7 @@ class Gammatonegram(nn.Module):
--------
>>> spec_layer = Spectrogram.Gammatonegram()
>>> specs = spec_layer(x)
"""

def __init__(
@@ -70,7 +73,7 @@ def __init__(
trainable_STFT=False,
verbose=True,
):
- super(Gammatonegram, self).__init__()
+ super().__init__()
self.stride = hop_length
self.center = center
self.pad_mode = pad_mode
@@ -97,7 +100,7 @@ def __init__(

# Creating kernel for Gammatone spectrogram
start = time()
- gammatone_basis = gammatone(sr, n_fft, n_bins, fmin, fmax)
+ gammatone_basis = get_gammatone(sr, n_fft, n_bins, fmin, fmax)
gammatone_basis = torch.tensor(gammatone_basis)

if verbose == True:
@@ -147,3 +150,4 @@ def forward(self, x):

gammatonespec = torch.matmul(self.gammatone_basis, spec)
return gammatonespec
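To make the docstring above concrete, here is a minimal usage sketch. It assumes ``Gammatonegram`` is re-exported from ``nnAudio.features`` like the other layers, and it uses input shape 2, ``(num_audio, len_audio)``; the exact number of output time steps depends on ``hop_length`` and ``center``:

    import torch
    from nnAudio import features

    spec_layer = features.Gammatonegram(sr=44100, n_fft=2048, hop_length=512)
    x = torch.randn(2, 44100)      # (num_audio, len_audio): two 1-second clips
    specs = spec_layer(x)          # -> (num_audio, n_bins, time_steps)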

1 change: 1 addition & 0 deletions Installation/nnAudio/features/griffin_lim.py
@@ -146,3 +146,4 @@ def forward(self, S):
center=self.center,
)
return inverse

2 changes: 1 addition & 1 deletion Installation/nnAudio/features/mel.py
@@ -142,7 +142,7 @@ def __init__(

# Creating kernel for mel spectrogram
start = time()
- mel_basis = mel(sr, n_fft, n_mels, fmin, fmax, htk=htk, norm=norm)
+ mel_basis = get_mel(sr, n_fft, n_mels, fmin, fmax, htk=htk, norm=norm)
mel_basis = torch.tensor(mel_basis)

if verbose == True:
4 changes: 2 additions & 2 deletions Installation/nnAudio/librosa_functions.py
@@ -150,7 +150,7 @@ def fft2gammatonemx(
return wts, cfreqs


- def gammatone(
+ def get_gammatone(
sr, n_fft, n_bins=64, fmin=20.0, fmax=None, htk=False, norm=1, dtype=np.float32
):
"""Create a Filterbank matrix to combine FFT bins into Gammatone bins
@@ -372,7 +372,7 @@ def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False):
return mel_to_hz(mels, htk=htk)


- def mel(
+ def get_mel(
sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1, dtype=np.float32
):
"""
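A short sketch of the renamed filterbank helpers, with signatures copied from the definitions above; converting the result to a tensor mirrors what the spectrogram layers do internally during kernel creation:

    import torch
    from nnAudio.librosa_functions import get_mel, get_gammatone

    mel_fb = get_mel(22050, 2048, n_mels=128, fmin=0.0, fmax=None)  # NumPy Mel filterbank matrix
    gt_fb = get_gammatone(22050, 2048, n_bins=64, fmin=20.0)        # NumPy Gammatone filterbank matrix
    mel_basis = torch.tensor(mel_fb)                                # as done inside MelSpectrogram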
6 changes: 3 additions & 3 deletions Sphinx/source/citing.rst
@@ -4,13 +4,13 @@ Citing nnAudio
If you use nnAudio in your research, please feel free to cite our work.

Plain Text
- -----------
+ **********
K. W. Cheuk, H. Anderson, K. Agres and D. Herremans,
"nnAudio: An on-the-Fly GPU Audio to Spectrogram Conversion Toolbox Using 1D Convolutional Neural Networks,"
in IEEE Access, vol. 8, pp. 161981-162003, 2020, doi: 10.1109/ACCESS.2020.3019084.

BibTex
- -------
+ **********

.. code-block:: tex

@@ -25,7 +25,7 @@ BibTex
doi={10.1109/ACCESS.2020.3019084}}

Link to the paper
- -----------------
+ ******************

The paper for nnAudio is available on `IEEE Access <https://ieeexplore.ieee.org/document/9174990>`__

28 changes: 4 additions & 24 deletions Sphinx/source/examples.rst
@@ -1,27 +1,7 @@
Tutorials
=============

- Call for Contribution:
- **********************
-
-
- nnAudio is a fast-growing package. With the increasing number of feature requests, we welcome anyone who is familiar with digital signal processing and neural network to contribute to nnAudio. The current list of pending features includes:
-
- 1. Invertible Constant Q Transform (CQT)
- 2. CQT with filter scale factor (see issue `#54 <https://github.com/KinWaiCheuk/nnAudio/issues/54>`__)
- 3. Variable Q Transform see `VQT <https://www.researchgate.net/publication/274009051_A_Matlab_Toolbox_for_Efficient_Perfect_Reconstruction_Time-Frequency_Transforms_with_Log-Frequency_Resolution>`__)
- 4. Speed and Performance improvements for Griffin-Lim (see issue `#41 <https://github.com/KinWaiCheuk/nnAudio/issues/41>`__)
- 5. Data Augmentation (see issue `#49 <https://github.com/KinWaiCheuk/nnAudio/issues/49>`__)
-
- (Quick tips for unit test: `cd` inside Installation folder, then type `pytest`. You need at least 1931 MiB GPU memory to pass all the unit tests)
-
- Alternatively, you may also contribute by:
-
- 1. Refactoring the code structure (Now all functions are within the same file, but with the increasing number of features, I think we need to break it down into smaller modules)
- 2. Making a better demonstration code or tutorial
-
- People who are interested in contributing to nnAudio can visit
- the `github page <https://github.com/KinWaiCheuk/nnAudio>`_ or
- contact me via kinwai<underscore>cheuk<at>mymail.sutd.edu.sg.
`PyTorch Template for Audio projects <https://github.com/KinWaiCheuk/pytorch_template>`_
===========================================================================================

I am building a pytorch template which allows audio-related projects to be quickly set up.
This `template <https://github.com/KinWaiCheuk/pytorch_template>`_ uses nnAudio to extract spectrograms on-the-fly.

29 changes: 29 additions & 0 deletions Sphinx/source/github.rst
@@ -0,0 +1,29 @@
Source Code
===========
The source code for nnAudio is available at `github <https://github.com/KinWaiCheuk/nnAudio>`_.


Call for Contribution
======================


nnAudio is a fast-growing package. With the increasing number of feature requests, we welcome anyone who is familiar with digital signal processing and neural networks to contribute to nnAudio. The current list of pending features includes:

1. Invertible Constant Q Transform (CQT)
2. CQT with filter scale factor (see issue `#54 <https://github.com/KinWaiCheuk/nnAudio/issues/54>`__)
3. Variable Q Transform (see `VQT <https://www.researchgate.net/publication/274009051_A_Matlab_Toolbox_for_Efficient_Perfect_Reconstruction_Time-Frequency_Transforms_with_Log-Frequency_Resolution>`__)
4. Speed and Performance improvements for Griffin-Lim (see issue `#41 <https://github.com/KinWaiCheuk/nnAudio/issues/41>`__)
5. Data Augmentation (see issue `#49 <https://github.com/KinWaiCheuk/nnAudio/issues/49>`__)

(Quick tip for unit tests: `cd` into the Installation folder, then run `pytest`. You need at least 1931 MiB of GPU memory to pass all the unit tests.)

Alternatively, you may also contribute by:

1. Refactoring the code structure (Now all functions are within the same file, but with the increasing number of features, I think we need to break it down into smaller modules)
2. Making a better demonstration code or tutorial

People who are interested in contributing to nnAudio can visit
the `github page <https://github.com/KinWaiCheuk/nnAudio>`_ or
contact me via kinwai<underscore>cheuk<at>mymail.sutd.edu.sg.


33 changes: 29 additions & 4 deletions Sphinx/source/index.rst
@@ -5,9 +5,27 @@
nnAudio |ProjectVersion|
===================================
- Welcome to nnAudio |ProjectVersion|. This new version changes the syntax of the spectrogram layers creation,
- such that ``stft_layer.to(device)`` can be used. This new version is more stable
- than the previous version since it is more compatible with other torch modules.
+ Welcome to nnAudio |ProjectVersion|. A big shout out to `Miguel Pérez <https://github.com/migperfer>`_ who made this new update possible. Please feel free to check out his `github repositories <https://github.com/migperfer>`_ too.

This new version restructured the coding style, making things more modular and pythonic. In terms of functionality, everything remains the same. In future releases, ``nnAudio.Spectrogram`` will be replaced by ``nnAudio.features`` (see also :func:`~nnAudio.features`).

Quick Start
***********
.. code-block:: python
    :emphasize-lines: 1,8-10,12

    from nnAudio import features
    from scipy.io import wavfile
    import torch

    sr, song = wavfile.read('./Bach.wav') # Loading your audio
    x = song.mean(1) # Converting Stereo to Mono
    x = torch.tensor(x, device='cuda:0').float() # casting the array into a PyTorch Tensor
    spec_layer = features.STFT(n_fft=2048, freq_bins=None, hop_length=512,
                               window='hann', freq_scale='linear', center=True, pad_mode='reflect',
                               fmin=50, fmax=11025, sr=sr) # Initializing the model

    spec = spec_layer(x) # Feed-forward your waveform to get the spectrogram
nnAudio is an audio processing toolbox using PyTorch convolutional neural
network as its backend. By doing so, spectrograms can be generated from
@@ -49,9 +67,16 @@ The source code for **nnAudio** can be found in `GitHub <https://github.com/KinW

.. toctree::
:maxdepth: 1
- :caption: Tutorials
+ :caption: Examples

examples


.. toctree::
:maxdepth: 1
:caption: GitHub

github


.. toctree::
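For readers upgrading from an older release, a hedged migration sketch of the note above (both import paths are expected to coexist in 0.3.0, with ``nnAudio.features`` being the forward-looking one):

    from nnAudio import features   # preferred from v0.3.0 onwards

    spec_layer = features.STFT(n_fft=2048, hop_length=512, sr=22050)

    # Legacy path, slated to be replaced by nnAudio.features in future releases:
    # from nnAudio import Spectrogram
    # spec_layer = Spectrogram.STFT(n_fft=2048, hop_length=512, sr=22050)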
47 changes: 30 additions & 17 deletions Sphinx/source/intro.rst
@@ -58,14 +58,14 @@ The input shape should be `(batch, len_audio)`.

.. code-block:: python
- from nnAudio import Spectrogram
+ from nnAudio import features
from scipy.io import wavfile
import torch
sr, song = wavfile.read('./Bach.wav') # Loading your audio
x = song.mean(1) # Converting Stereo to Mono
x = torch.tensor(x, device='cuda:0').float() # casting the array into a PyTorch Tensor
- spec_layer = Spectrogram.STFT(n_fft=2048, freq_bins=None, hop_length=512,
+ spec_layer = features.STFT(n_fft=2048, freq_bins=None, hop_length=512,
window='hann', freq_scale='linear', center=True, pad_mode='reflect',
fmin=50,fmax=11025, sr=sr) # Initializing the model
@@ -76,22 +76,27 @@ The input shape should be `(batch, len_audio)`.

On-the-fly audio processing
~~~~~~~~~~~~~~~~~~~~~~~~~~~
- One application for nnAudio is on-the-fly spectrogram generation when integrating it inside your neural network
+ By integrating nnAudio inside your neural network, it can be used for on-the-fly spectrogram extraction. Here is an example of how to put nnAudio inside your neural network (highlighted in yellow):

.. code-block:: python
- :emphasize-lines: 5-10,27
+ :emphasize-lines: 10-15,32
from nnAudio import features
import torch
import torch.nn as nn
class Model(torch.nn.Module):
- def __init__(self):
- super(Model, self).__init__()
+ def __init__(self, n_fft, output_dim):
+ super().__init__()
+ self.epsilon=1e-10
# Getting Mel Spectrogram on the fly
- self.spec_layer = Spectrogram.STFT(n_fft=2048, freq_bins=None,
+ self.spec_layer = features.STFT(n_fft=n_fft, freq_bins=None,
hop_length=512, window='hann',
freq_scale='no', center=True,
pad_mode='reflect', fmin=50,
fmax=6000, sr=22050, trainable=False,
output_format='Magnitude')
- self.n_bins = freq_bins
+ self.n_bins = n_fft//2
# Creating CNN Layers
self.CNN_freq_kernel_size=(128,1)
@@ -101,19 +106,27 @@ One application for nnAudio is on-the-fly spectrogram generation when integratin
self.CNN_freq = nn.Conv2d(1,k_out,
kernel_size=self.CNN_freq_kernel_size,stride=self.CNN_freq_kernel_stride)
self.CNN_time = nn.Conv2d(k_out,k2_out,
- kernel_size=(1,regions),stride=(1,1))
+ kernel_size=(1,3),stride=(1,1))
self.region_v = 1 + (self.n_bins-self.CNN_freq_kernel_size[0])//self.CNN_freq_kernel_stride[0]
- self.linear = torch.nn.Linear(k2_out*self.region_v, m, bias=False)
+ self.linear = torch.nn.Linear(k2_out*self.region_v, output_dim, bias=False)
def forward(self,x):
z = self.spec_layer(x)
- z = torch.log(z+epsilon)
+ z = torch.log(z+self.epsilon)
z2 = torch.relu(self.CNN_freq(z.unsqueeze(1)))
- z3 = torch.relu(self.CNN_time(z2))
+ z3 = torch.relu(self.CNN_time(z2)).mean(-1)
y = self.linear(torch.relu(torch.flatten(z3,1)))
return torch.sigmoid(y)
After that, your model can take waveforms directly as input and extract spectrograms on the fly during the forward pass.

.. code-block:: python
    :emphasize-lines: 2

    waveforms = torch.randn(4,44100)
    model(waveforms) # automatically convert waveforms into spectrograms
Using GPU
~~~~~~~~~
@@ -124,9 +137,9 @@ to transfer the spectrogram layer to any device you like.

.. code-block:: python
- spec_layer = Spectrogram.STFT().to(device)
+ spec_layer = features.STFT().to(device)
- Alternatively, if your ``Spectrogram`` module is used inside your PyTorch model
+ Alternatively, if your ``features`` module is used inside your PyTorch model
as in the :ref:`on-the-fly processing section<on-the-fly>`, then you just need
to simply do ``net.to(device)``, where ``net = Model()``.

@@ -149,7 +162,7 @@ The speed test is conducted using three different machines, and it shows that nn
Trainable kernels
*****************

- Fourier basis in :func:`~nnAudio.Spectrogram.STFT` can be set trainable by using ``trainable=True`` argument. Fourier basis in :func:`~nnAudio.Spectrogram.MelSpectrogram` can be also set trainable by using `trainable_STFT=True`, and Mel filter banks can be set trainable using ``trainable_mel=False`` argument. The same goes for :func:`~nnAudio.Spectrogram.CQT`.
+ The Fourier basis in :func:`~nnAudio.features.stft.STFT` can be made trainable by using the ``trainable=True`` argument. The Fourier basis in :func:`~nnAudio.features.mel.MelSpectrogram` can also be made trainable by using ``trainable_STFT=True``, and the Mel filter banks can be made trainable using the ``trainable_mel=True`` argument. The same goes for :func:`~nnAudio.features.cqt.CQT`.

The following demonstrations are available on Google Colab.
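A minimal sketch of the trainable-kernel arguments described above, using the v0.3.0 ``features`` layout; the optimizer line is only an illustrative assumption:

    import torch
    from nnAudio import features

    stft_layer = features.STFT(n_fft=2048, trainable=True)        # learnable Fourier basis
    mel_layer = features.MelSpectrogram(n_fft=2048, trainable_STFT=True,
                                        trainable_mel=True)       # learnable STFT and Mel kernels
    optimizer = torch.optim.SGD(mel_layer.parameters(), lr=1e-4)  # the kernels now receive gradients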

@@ -186,7 +199,7 @@ The default CQT in nnAudio is the ``CQT1992v2`` version.
For more detail, please refer to our `paper <https://ieeexplore.ieee.org/document/9174990>`__

All versions of CQT are available for users to choose from.
- To explicitly choose which CQT to use, you can refer to the :ref:`CQT API section<nnAudio.Spectrogram.CQT>`.
+ To explicitly choose which CQT to use, you can refer to the :ref:`CQT API section<nnAudio.features.cqt.CQT>`.


.. image:: ../../figures/CQT_compare.png
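A sketch of explicitly selecting a CQT implementation, per the text above (the constructor arguments shown are assumptions based on the STFT examples):

    from nnAudio import features

    cqt_layer = features.CQT(sr=22050, hop_length=512)       # default algorithm: CQT1992v2
    cqt_1992 = features.CQT1992v2(sr=22050, hop_length=512)  # the same algorithm, chosen explicitly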
3 changes: 1 addition & 2 deletions Sphinx/source/nnAudio.rst
@@ -1,11 +1,10 @@

.. automodule:: nnAudio

.. autosummary::
:toctree: _autosummary
:template: custom-module-template.rst
:recursive:

- nnAudio.Spectrogram
+ nnAudio.features
nnAudio.librosa_functions
nnAudio.utils
