Added DTW_MARGIN for each level. Added 48000 Hz parameters to ffmpeg wrapper. Minor docs improvements.
oktyabrenok · Nov 26, 2016 · 6fa3a7e · 6fa3a7e
1 parent 7d1a3a8
commit 6fa3a7e
Show file tree

Hide file tree

Showing 9 changed files with 106 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 **aeneas** is a Python/C library and a set of tools to automagically synchronize audio and text (aka forced alignment).
 
 * Version: 1.7.0
-* Date: 2016-12-??
+* Date: 2016-12-07
 * Developed by: [ReadBeyond](http://www.readbeyond.it/)
 * Lead Developer: [Alberto Pettarin](http://www.albertopettarin.it/)
 * License: the GNU Affero General Public License Version 3 (AGPL v3)
@@ -238,7 +238,7 @@ which explains how to use the built-in command line tools.
 * Batch processing of multiple audio/text pairs
 * Download audio from a YouTube video
 * In multilevel mode, recursive alignment from paragraph to sentence to word level
-* In multilevel mode, time resolution and/or TTS engine can be specified for each level independently
+* In multilevel mode, MFCC resolution, MFCC masking, DTW margin, and TTS engine can be specified for each level independently
 * Robust against misspelled/mispronounced words, local rearrangements of words, background noise/sporadic spikes
 * Adjustable splitting times, including a max character/second constraint for CC applications
 * Automated detection of audio head/tail

diff --git a/README.rst b/README.rst
@@ -5,7 +5,7 @@ aeneas
 synchronize audio and text (aka forced alignment).
 
 -  Version: 1.7.0
--  Date: 2016-12-??
+-  Date: 2016-12-07
 -  Developed by: `ReadBeyond <http://www.readbeyond.it/>`__
 -  Lead Developer: `Alberto Pettarin <http://www.albertopettarin.it/>`__
 -  License: the GNU Affero General Public License Version 3 (AGPL v3)
@@ -256,8 +256,8 @@ Supported Features
 -  Download audio from a YouTube video
 -  In multilevel mode, recursive alignment from paragraph to sentence to
    word level
--  In multilevel mode, time resolution and/or TTS engine can be
-   specified for each level independently
+-  In multilevel mode, MFCC resolution, MFCC masking, DTW margin, and
+   TTS engine can be specified for each level independently
 -  Robust against misspelled/mispronounced words, local rearrangements
    of words, background noise/sporadic spikes
 -  Adjustable splitting times, including a max character/second

diff --git a/aeneas/dtw.py b/aeneas/dtw.py
@@ -284,7 +284,7 @@ def _setup_dtw(self):
 
         # setup
         algorithm = self.rconf[RuntimeConfiguration.DTW_ALGORITHM]
-        delta = int(2 * self.rconf[RuntimeConfiguration.DTW_MARGIN] / self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT])
+        delta = int(2 * self.rconf.dtw_margin / self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT])
         mfcc2_length = self.synt_wave_mfcc.middle_length
         self.log([u"Requested algorithm: '%s'", algorithm])
         self.log([u"delta = %d", delta])

diff --git a/aeneas/ffmpegwrapper.py b/aeneas/ffmpegwrapper.py
@@ -72,6 +72,9 @@ class FFMPEGWrapper(Loggable):
     FFMPEG_SAMPLE_44100 = ["-ar", "44100"]
     """ Single parameter for ``ffmpeg``: 44100 Hz sampling """
 
+    FFMPEG_SAMPLE_48000 = ["-ar", "48000"]
+    """ Single parameter for ``ffmpeg``: 48000 Hz sampling """
+
     FFMPEG_MONO = ["-ac", "1"]
     """ Single parameter for ``ffmpeg``: mono (1 channel) """
 
@@ -134,11 +137,21 @@ class FFMPEGWrapper(Loggable):
     )
     """ Set of parameters for ``ffmpeg`` with 44100 Hz sampling """
 
+    FFMPEG_PARAMETERS_SAMPLE_48000 = (
+        FFMPEG_MONO +
+        FFMPEG_SAMPLE_48000 +
+        FFMPEG_OVERWRITE +
+        FFMPEG_PLAIN_HEADER +
+        FFMPEG_FORMAT_WAVE
+    )
+    """ Set of parameters for ``ffmpeg`` with 48000 Hz sampling """
+
     FFMPEG_PARAMETERS_MAP = {
         8000: FFMPEG_PARAMETERS_SAMPLE_8000,
         16000: FFMPEG_PARAMETERS_SAMPLE_16000,
         22050: FFMPEG_PARAMETERS_SAMPLE_22050,
-        44100: FFMPEG_PARAMETERS_SAMPLE_44100
+        44100: FFMPEG_PARAMETERS_SAMPLE_44100,
+        48000: FFMPEG_PARAMETERS_SAMPLE_48000
     }
     """ Map sample rate to parameter list """
 

diff --git a/aeneas/globalconstants.py b/aeneas/globalconstants.py
@@ -355,9 +355,10 @@
 
 Example::
 
+    task_language=eng
     task_language=eng-GBR
     task_language=eng-USA
-    task_language=ita-ITA
+    task_language=ita
 
 """
 

diff --git a/aeneas/runtimeconfiguration.py b/aeneas/runtimeconfiguration.py
@@ -180,12 +180,45 @@ class RuntimeConfiguration(Configuration):
     """
     DTW aligner margin, in seconds, for the ``stripe`` algorithm.
 
-    Default: ``60``, corresponding to ``60s`` ahead and behind
-    (i.e., ``120s`` total margin).
+    Default: ``60``, corresponding to ``60 s`` ahead and behind
+    (i.e., ``120 s`` total margin).
 
     .. versionadded:: 1.4.1
     """
 
+    DTW_MARGIN_L1 = "dtw_margin_l1"
+    """
+    DTW aligner margin, in seconds, for the ``stripe`` algorithm
+    at level 1 (paragraph).
+
+    Default: ``60``, corresponding to ``60 s`` ahead and behind
+    (i.e., ``120 s`` total margin).
+
+    .. versionadded:: 1.7.0
+    """
+
+    DTW_MARGIN_L2 = "dtw_margin_l2"
+    """
+    DTW aligner margin, in seconds, for the ``stripe`` algorithm
+    at level 2 (sentence).
+
+    Default: ``30``, corresponding to ``30 s`` ahead and behind
+    (i.e., ``60 s`` total margin).
+
+    .. versionadded:: 1.7.0
+    """
+
+    DTW_MARGIN_L3 = "dtw_margin_l3"
+    """
+    DTW aligner margin, in seconds, for the ``stripe`` algorithm
+    at level 3 (word).
+
+    Default: ``10``, corresponding to ``10 s`` ahead and behind
+    (i.e., ``20 s`` total margin).
+
+    .. versionadded:: 1.7.0
+    """
+
     FFMPEG_PATH = "ffmpeg_path"
     """
     Path to the ``ffmpeg`` executable.
@@ -420,12 +453,28 @@ class RuntimeConfiguration(Configuration):
     """
 
     MFCC_GRANULARITY_MAP = {
-        1: (MFCC_MASK_NONSPEECH_L1, MFCC_WINDOW_LENGTH_L1, MFCC_WINDOW_SHIFT_L1),
-        2: (MFCC_MASK_NONSPEECH_L2, MFCC_WINDOW_LENGTH_L2, MFCC_WINDOW_SHIFT_L2),
-        3: (MFCC_MASK_NONSPEECH_L3, MFCC_WINDOW_LENGTH_L3, MFCC_WINDOW_SHIFT_L3),
+        1: (
+            DTW_MARGIN_L1,
+            MFCC_MASK_NONSPEECH_L1,
+            MFCC_WINDOW_LENGTH_L1,
+            MFCC_WINDOW_SHIFT_L1
+        ),
+        2: (
+            DTW_MARGIN_L2,
+            MFCC_MASK_NONSPEECH_L2,
+            MFCC_WINDOW_LENGTH_L2,
+            MFCC_WINDOW_SHIFT_L2
+        ),
+        3: (
+            DTW_MARGIN_L3,
+            MFCC_MASK_NONSPEECH_L3,
+            MFCC_WINDOW_LENGTH_L3,
+            MFCC_WINDOW_SHIFT_L3
+        ),
     }
     """
     Map level numbers to
+    ``DTW_MARGIN_*``,
     ``MFCC_MASK_NONSPEECH_*``,
     ``MFCC_WINDOW_LENGTH_*``,
     and ``MFCC_WINDOW_SHIFT_*``
@@ -862,12 +911,15 @@ class RuntimeConfiguration(Configuration):
         (MFCC_MASK_LOG_ENERGY_THRESHOLD, (0.699, float, [], u"when masking MFCC, log energy threshold for speech")),
         (MFCC_MASK_MIN_NONSPEECH_LENGTH, (1, int, [], u"when masking MFCC, min nonspeech interval length, in frames")),
 
+        (DTW_MARGIN_L1, ("60.000", TimeValue, [], u"level 1 (para) DTW margin, in s")),
         (MFCC_MASK_NONSPEECH_L1, (False, bool, [], u"if True, mask MFCC nonspeech frames on level 1 (para)")),
         (MFCC_WINDOW_LENGTH_L1, ("0.100", TimeValue, [], u"level 1 (para) MFCC window length, in s")),
         (MFCC_WINDOW_SHIFT_L1, ("0.040", TimeValue, [], u"level 1 (para) MFCC window shift, in s")),
+        (DTW_MARGIN_L2, ("30.000", TimeValue, [], u"level 2 (sent) DTW margin, in s")),
         (MFCC_MASK_NONSPEECH_L2, (False, bool, [], u"if True, mask MFCC nonspeech frames on level 2 (sent)")),
         (MFCC_WINDOW_LENGTH_L2, ("0.050", TimeValue, [], u"level 2 (sent) MFCC window length, in s")),
         (MFCC_WINDOW_SHIFT_L2, ("0.020", TimeValue, [], u"level 2 (sent) MFCC window shift, in s")),
+        (DTW_MARGIN_L3, ("10.000", TimeValue, [], u"level 3 (word) DTW margin, in s")),
         (MFCC_MASK_NONSPEECH_L3, (False, bool, [], u"if True, mask MFCC nonspeech frames on level 3 (word)")),
         (MFCC_WINDOW_LENGTH_L3, ("0.020", TimeValue, [], u"level 3 (word) MFCC window length, in s")),
         (MFCC_WINDOW_SHIFT_L3, ("0.005", TimeValue, [], u"level 3 (word) MFCC window shift, in s")),
@@ -931,6 +983,17 @@ def sample_rate(self):
         """
         return self[self.FFMPEG_SAMPLE_RATE]
 
+    @property
+    def dtw_margin(self):
+        """
+        Return the value of the
+        :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.DTW_MARGIN`
+        key stored in this configuration object.
+
+        :rtype: :class:`~aeneas.exacttiming.TimeValue`
+        """
+        return self[self.DTW_MARGIN]
+
     @property
     def mmn(self):
         """
@@ -1003,7 +1066,8 @@ def set_granularity(self, level):
         :param int level: the desired granularity level
         """
         if level in self.MFCC_GRANULARITY_MAP.keys():
-            mask_key, length_key, shift_key = self.MFCC_GRANULARITY_MAP[level]
+            margin_key, mask_key, length_key, shift_key = self.MFCC_GRANULARITY_MAP[level]
+            self[self.DTW_MARGIN] = self[margin_key]
             self[self.MFCC_MASK_NONSPEECH] = self[mask_key]
             self[self.MFCC_WINDOW_LENGTH] = self[length_key]
             self[self.MFCC_WINDOW_SHIFT] = self[shift_key]

diff --git a/aeneas/tests/test_runtimeconfiguration.py b/aeneas/tests/test_runtimeconfiguration.py
@@ -41,6 +41,10 @@ def test_sample_rate(self):
         rconf = RuntimeConfiguration()
         self.assertEqual(rconf.sample_rate, 16000)
 
+    def test_dtw_margin(self):
+        rconf = RuntimeConfiguration()
+        self.assertEqual(rconf.dtw_margin, TimeValue("60.000"))
+
     def test_mmn(self):
         rconf = RuntimeConfiguration()
         self.assertEqual(rconf.mmn, False)
@@ -120,10 +124,16 @@ def test_set_rconf_string(self):
             (u"mfcc_mask_nonspeech=True", "mfcc_mask_nonspeech", True),
             (u"mfcc_window_length=0.360", "mfcc_window_length", TimeValue("0.360")),
             (u"mfcc_window_shift=0.160", "mfcc_window_shift", TimeValue("0.160")),
+            (u"dtw_margin_l1=100", "dtw_margin_l1", TimeValue("100")),
+            (u"mfcc_mask_nonspeech_l1=True", "mfcc_mask_nonspeech_l1", True),
             (u"mfcc_window_length_l1=0.360", "mfcc_window_length_l1", TimeValue("0.360")),
             (u"mfcc_window_shift_l1=0.160", "mfcc_window_shift_l1", TimeValue("0.160")),
+            (u"dtw_margin_l2=30", "dtw_margin_l2", TimeValue("30")),
+            (u"mfcc_mask_nonspeech_l2=True", "mfcc_mask_nonspeech_l2", True),
             (u"mfcc_window_length_l2=0.360", "mfcc_window_length_l2", TimeValue("0.360")),
             (u"mfcc_window_shift_l2=0.160", "mfcc_window_shift_l2", TimeValue("0.160")),
+            (u"dtw_margin_l3=10", "dtw_margin_l3", TimeValue("10")),
+            (u"mfcc_mask_nonspeech_l3=True", "mfcc_mask_nonspeech_l3", True),
             (u"mfcc_window_length_l3=0.360", "mfcc_window_length_l3", TimeValue("0.360")),
             (u"mfcc_window_shift_l3=0.160", "mfcc_window_shift_l3", TimeValue("0.160")),
             (u"mfcc_mask_extend_speech_after=1", "mfcc_mask_extend_speech_after", 1),

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -36,13 +36,15 @@ v1.7.0 (2016-12-??)
 #. Added more tool tests
 #. Added ``wiki/TESTING.md``
 #. Added ``venvs`` directory with scripts to automate testing with virtual environments
-#. Added ``RuntimeConfiguration`` parameters to switch MFCC masking on
+#. Added ``RuntimeConfiguration`` parameters to switch MFCC masking on, including per level in multilevel tasks
 #. Modified ``DTWAligner``, ``AudioFileMFCC``, ``ExecuteTask``, and ``VAD`` to allow MFCC masking
 #. Added field human-readable descriptions in ``Configuration`` and its subclasses
 #. Better ``--list-parameters`` in ``ExecuteTask``
 #. Added ``--list-parameters`` in ``ExecuteJob``
 #. Added ``--help-rconf`` option to all tools
 #. Added check in ``ExecuteTask`` on the consistency of the computed sync map
+#. Added ``RuntimeConfiguration`` parameters ``DTW_MARGIN_L1``, ``DTW_MARGIN_L2``, and ``DTW_MARGIN_L3`` to change the DTW margin of each level
+#. Added ``FFMPEG_PARAMETERS_SAMPLE_48000`` to ``ffmpegwrapper.py``
 
 v1.6.0.1 (2016-09-30)
 ---------------------

diff --git a/docs/source/libtutorial.rst b/docs/source/libtutorial.rst
@@ -78,7 +78,7 @@ Clearly, you can also manipulate objects programmatically.
         ExecuteTask(task).execute()
 
         # print fragments with a duration < 5 seconds
-        for fragment in task.sync_map_vleaves:
+        for fragment in task.sync_map_leaves():
             if fragment.length < 5.0:
                 print(fragment)