added SI-SDR

benjohn18 · Nov 21, 2019 · 702d46b · 702d46b
1 parent 81e9759
commit 702d46b
Show file tree

Hide file tree

Showing 9 changed files with 105 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -28,29 +28,47 @@ pip install git+https://github.com/aliutkus/speechmetrics#egg=speechmetrics[gpu]
 
 `speechmetrics` has been designed to be easily used in a modular way. All you need to do is to specify the actual metrics you want to use and it will load them.
 
-This behaviour is encapsulated in the `load` function from the root of the package, that takes two arguments:
-* metrics: str or list of str
-  the available metrics that match this argument will be automatically loaded. This matching is relative to the structure of the speechmetrics package.
-  For instance:
-    - 'absolute' will match all absolute metrics
-    - 'absolute.srmr' or 'srmr' will only match SRMR
-    - '' will match all
-* window: float or None
-  gives the length in seconds of the windows on which to compute the actual scores. If None, the whole signals will be considered.
+The process is to:
+1. Load the metrics you want with the `load` function from the root of the package, which takes two arguments:
+    * metrics: str or list of str
+      the available metrics that match this argument will be automatically loaded. This matching is relative to the structure of the speechmetrics package.
+      For instance:
+        - 'absolute' will match all absolute metrics
+        - 'absolute.srmr' or 'srmr' will only match SRMR
+        - '' will match all
+    * window: float or None
+      gives the length in seconds of the windows on which to compute the actual scores. If None, the whole signals will be considered.  
+    ```my_metrics = speechmetrics.load('relative', window=5)```
+
+2. Just call the object returned by `load` with your estimated file (and your reference in case of relative metrics.)  
+   ```scores = my_metrics(path_to_estimate, path_to_reference)```
+> __WARNING__: The convention for relative metrics is to provide __estimate first, and reference second__.  
+>  This is the opposite of the usual convention, which puts the reference first.  
+>     $\Rightarrow$ The advantage is: you can still call absolute metrics with the same code, they will just ignore the reference.  
 
 ## Example
 ```
+# the case of absolute metrics
 import speechmetrics
 window_length = 5 # seconds
 metrics = speechmetrics.load('absolute', window_length)
-
 scores = metrics(path_to_audio_file)
+
+# the case of relative metrics
+metrics = speechmetrics.load(['bsseval', 'sisdr'], window_length)
+scores = metrics(path_to_estimate_file, path_to_reference)
+
+# mixed case, still works
+metrics = speechmetrics.load(['bsseval', 'mosnet'], window_length)
+scores = metrics(path_to_estimate_file, path_to_reference)
+
 ```
 
 # Available metrics
-## Absolute metrics
 
-### MOSNet
+## Absolute metrics (`absolute`)
+
+### MOSNet (`absolute.mosnet` or `mosnet`)
 
 As provided by the authors of [MOSNet: Deep Learning based Objective Assessment for Voice Conversion](https://arxiv.org/abs/1904.08352). Original github [here](https://github.com/lochenchou/MOSNet)
 > @article{lo2019mosnet,  
@@ -60,7 +78,7 @@ As provided by the authors of [MOSNet: Deep Learning based Objective Assessment
   year={2019}
 }
 
-### SRMR
+### SRMR (`absolute.srmr` or `srmr`)
 
 As provided by the [SRMR Toolbox](https://github.com/jfsantos/SRMRpy), implemented by [@jfsantos](https://github.com/jfsantos).
 
@@ -74,7 +92,7 @@ As provided by the [SRMR Toolbox](https://github.com/jfsantos/SRMRpy), implement
   year={2010},  
 }
 
-* > @inproceedings{santos2014updated,
+* > @inproceedings{santos2014updated,  
   title={An updated objective intelligibility estimation metric for normal hearing listeners under noise and reverberation},  
   author={Santos, João F and Senoussaoui, Mohammed and Falk, Tiago H},  
   booktitle={Proc. Int. Workshop Acoust. Signal Enhancement},  
@@ -92,8 +110,9 @@ As provided by the [SRMR Toolbox](https://github.com/jfsantos/SRMRpy), implement
   year={2014},  
 }
 
-## Relative metrics
-### BSSEval
+## Relative metrics (`relative`)
+
+### BSSEval (`relative.bsseval` or `bsseval`)
 
 As presented in [this](https://hal-lirmm.ccsd.cnrs.fr/lirmm-01766791v2/document) paper and freely available in [the official museval page](https://github.com/sigsep/sigsep-mus-eval), corresponds to BSSEval v4. There are 3 submetrics handled here: SDR, SAR, ISR.
 
@@ -106,11 +125,11 @@ As presented in [this](https://hal-lirmm.ccsd.cnrs.fr/lirmm-01766791v2/document)
   pages="293--305"  
 }
 
-### PESQ
+### PESQ (`relative.pesq` or `pesq`)
 
 As implemented [there](https://github.com/vBaiCai/python-pesq) by [@vBaiCai](https://github.com/vBaiCai).
 
-### STOI
+### STOI (`relative.stoi` or `stoi`)
 
 As implemented by [@mpariente](https://github.com/mpariente) [here](https://github.com/mpariente/pystoi)
 * > @inproceedings{taal2010short,  
@@ -141,3 +160,18 @@ As implemented by [@mpariente]() [here](https://github.com/mpariente/pystoi)
   year={2016},  
   publisher={IEEE}  
 }
+
+### SISDR: Scale-invariant SDR (`relative.sisdr` or `sisdr`) 
+
+As described in the following paper and implemented by @Jonathan-LeRoux [here](https://github.com/sigsep/bsseval/issues/3#issuecomment-494995846)
+* > @article{Roux_2019,  
+   title={SDR – Half-baked or Well Done?},  
+   ISBN={9781479981311},  
+   url={http://dx.doi.org/10.1109/ICASSP.2019.8683855},  
+   DOI={10.1109/icassp.2019.8683855},  
+   journal={ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},  
+   publisher={IEEE},  
+   author={Roux, Jonathan Le and Wisdom, Scott and Erdogan, Hakan and Hershey, John R.},  
+   year={2019},  
+   month={May}  
+}
diff --git a/examples/data/m2_script1_clean.wav b/examples/data/m2_script1_clean.wav
diff --git a/examples/data/m2_script1_ipad_confroom1.wav b/examples/data/m2_script1_ipad_confroom1.wav
diff --git a/examples/data/m2_script1_produced.wav b/examples/data/m2_script1_produced.wav
diff --git a/examples/test.py b/examples/test.py
@@ -3,6 +3,7 @@
 if __name__ == '__main__':
     window = 5
 
+    print('Trying ABSOLUTE metrics: ')
     metrics = sm.load('absolute', window)
 
     reference = 'data/m2_script1_produced.wav'
@@ -13,5 +14,20 @@
     for test in tests:
         import pprint
         print('Computing scores for ', test)
-        scores = metrics(test, reference)
+        scores = metrics(reference, test)
+        pprint.pprint(scores)
+
+    print('\nTrying RELATIVE metrics: ')
+
+    metrics = sm.load('relative', window)
+
+    reference = 'data/m2_script1_produced.wav'
+    tests = ['data/m2_script1_clean.wav',
+             'data/m2_script1_ipad_confroom1.wav',
+             'data/m2_script1_produced.wav']
+
+    for test in tests:
+        import pprint
+        print('Computing scores for ', test)
+        scores = metrics(reference, test)
         pprint.pprint(scores)
diff --git a/speechmetrics/relative/bsseval.py b/speechmetrics/relative/bsseval.py
@@ -11,8 +11,8 @@ def test_window(self, audios, rate):
         from museval.metrics import bss_eval
         if len(audios) != 2:
             raise ValueError('BSSEval needs a reference and a test signals.')
-        result = bss_eval(reference_sources=audios[0].T,
-                        estimated_sources=audios[1].T,
+        result = bss_eval(reference_sources=audios[1].T,
+                        estimated_sources=audios[0].T,
                         window=self.bss_window * rate,
                         hop=self.bss_hop * rate)
         return {'sdr': result[0], 'isr': result[1], 'sar': result[3]}

diff --git a/speechmetrics/relative/pesq.py b/speechmetrics/relative/pesq.py
@@ -11,7 +11,7 @@ def test_window(self, audios, rate):
         from pypesq import pesq
         if len(audios) != 2:
             raise ValueError('PESQ needs a reference and a test signals.')
-        return {'pesq': pesq(audios[0], audios[1], rate)}
+        return {'pesq': pesq(audios[1], audios[0], rate)}
 
 
 def load(window, hop=None):

diff --git a/speechmetrics/relative/sisdr.py b/speechmetrics/relative/sisdr.py
@@ -0,0 +1,32 @@
+from .. import Metric
+import numpy as np
+from numpy.linalg import norm
+
+
+class SISDR(Metric):
+    def __init__(self, window, hop=None):
+        super(SISDR, self).__init__(name='SISDR', window=window, hop=hop)
+        self.mono = True
+
+    def test_window(self, audios, rate):
+        # as provided by @Jonathan-LeRoux and slightly adapted for the case of just one reference
+        # and one estimate.
+        # see original code here: https://github.com/sigsep/bsseval/issues/3#issuecomment-494995846
+        eps = np.finfo(audios[0].dtype).eps
+        reference = audios[1].reshape(audios[1].shape[0], -1)
+        estimate = audios[0].reshape(audios[0].shape[0], -1)
+        Rss = np.dot(reference.T, reference)
+
+        # get the scaling factor for clean sources
+        a = (eps + np.dot(reference.T, estimate)) / (Rss + eps)
+
+        e_true = a * reference
+        e_res = estimate - e_true
+
+        Sss = (e_true**2).sum()
+        Snn = (e_res**2).sum()
+
+        return {'sisdr': 10 * np.log10((eps+ Sss)/(eps + Snn))}
+
+def load(window, hop=None):
+    return SISDR(window, hop)
diff --git a/speechmetrics/relative/stoi.py b/speechmetrics/relative/stoi.py
@@ -11,7 +11,7 @@ def test_window(self, audios, rate):
         if len(audios) != 2:
             raise ValueError('STOI needs a reference and a test signals.')
 
-        return {'stoi':stoi(audios[0], audios[1], rate, extended=False)}
+        return {'stoi':stoi(audios[1], audios[0], rate, extended=False)}
 
 
 def load(window, hop=None):