Merge pull request readbeyond#17 from pettarin/devel

Added id regex for subtitles, plain. Close readbeyond#14
a-1an · Oct 15, 2015 · 9ff1430 · 9ff1430
2 parents ff4971c + ada6c98
commit 9ff1430
Show file tree

Hide file tree

Showing 11 changed files with 137 additions and 20 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,7 +3,7 @@
 *.pyo
 *.swp
 *.so
-MANIFEST
+aeneas.egg-info
 aeneas/build
 bak
 build

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,10 @@
+recursive-include aeneas/tools/res *
+include check_dependencies.py
+recursive-include docs *
+prune docs/build
+include LICENSE
+recursive-include licenses *
+include README.md
+include README.txt
+include requirements.txt
+include VERSION
diff --git a/aeneas/globalconstants.py b/aeneas/globalconstants.py
@@ -721,6 +721,27 @@
 
 """
 
+PPN_TASK_OS_FILE_ID_REGEX = "os_task_file_id_regex"
+"""
+Key for the regex to be used for the fragment identifiers
+of the sync map output file.
+This parameter will be used only
+when the input text file has `plain` or `subtitles` format;
+for `parsed` and `unparsed` input text files, the identifiers
+contained in the input text file will be used instead.
+
+Usage: config string, TXT config file, XML config file
+
+Values: string (a `%`-style format with exactly one integer placeholder such as `%06d`, not a regular expression)
+
+Example::
+
+    os_task_file_id_regex=f%06d
+    os_task_file_id_regex=Word%03d
+
+.. versionadded:: 1.3.1
+"""
+
 PPN_TASK_OS_FILE_NAME = "os_task_file_name"
 """
 Key for the file name of the sync map output file

diff --git a/aeneas/task.py b/aeneas/task.py
@@ -216,6 +216,7 @@ def _populate_text_file(self):
             parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX] = self.configuration.is_text_unparsed_class_regex
             parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX] = self.configuration.is_text_unparsed_id_regex
             parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT] = self.configuration.is_text_unparsed_id_sort
+            parameters[gc.PPN_TASK_OS_FILE_ID_REGEX] = self.configuration.os_file_id_regex
             self.text_file = TextFile(
                 file_path=self.text_file_path_absolute,
                 file_format=self.configuration.is_text_file_format,
@@ -275,6 +276,7 @@ def __init__(self, config_string=None):
             gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
 
             gc.PPN_TASK_OS_FILE_FORMAT,
+            gc.PPN_TASK_OS_FILE_ID_REGEX,
             gc.PPN_TASK_OS_FILE_NAME,
             gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF,
             gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF,
@@ -631,6 +633,19 @@ def os_file_format(self):
     def os_file_format(self, value):
         self.fields[gc.PPN_TASK_OS_FILE_FORMAT] = value
 
+    @property
+    def os_file_id_regex(self):
+        """
+        The regex to be used for the fragment identifiers
+        of the sync map output file
+
+        :rtype: string
+        """
+        return self.fields[gc.PPN_TASK_OS_FILE_ID_REGEX]
+    @os_file_id_regex.setter
+    def os_file_id_regex(self, value):
+        self.fields[gc.PPN_TASK_OS_FILE_ID_REGEX] = value
+
     @property
     def os_file_name(self):
         """

diff --git a/aeneas/tests/res/container/empty_dir/.gitignore b/aeneas/tests/res/container/empty_dir/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/aeneas/tests/test_textfile.py b/aeneas/tests/test_textfile.py
@@ -23,6 +23,12 @@ class TestTextFile(unittest.TestCase):
         gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX : "ra",
         gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT : IDSortingAlgorithm.UNSORTED,
     }
+    ID_REGEX_PARAMETERS = {
+        gc.PPN_TASK_OS_FILE_ID_REGEX : "word%06d"
+    }
+    ID_REGEX_PARAMETERS_BAD = {
+        gc.PPN_TASK_OS_FILE_ID_REGEX : "word"
+    }
 
     def load(self, input_file_path=PLAIN_FILE_PATH, fmt=TextFileFormat.PLAIN, expected_length=15, parameters=None):
         tfl = TextFile(at.get_abs_path(input_file_path), fmt, parameters)
@@ -140,12 +146,45 @@ def test_read_subtitles(self):
         ]:
             self.load(path, TextFileFormat.SUBTITLES, 15)
 
+    def test_read_subtitles_id_regex(self):
+        for path in [
+                "res/inputtext/sonnet_subtitles_with_end_newline.txt",
+                "res/inputtext/sonnet_subtitles_no_end_newline.txt",
+                "res/inputtext/sonnet_subtitles_multiple_blank.txt",
+                "res/inputtext/sonnet_subtitles_multiple_rows.txt"
+        ]:
+            self.load(path, TextFileFormat.SUBTITLES, 15, self.ID_REGEX_PARAMETERS)
+
+    def test_read_subtitles_id_regex_bad(self):
+        for path in [
+                "res/inputtext/sonnet_subtitles_with_end_newline.txt",
+                "res/inputtext/sonnet_subtitles_no_end_newline.txt",
+                "res/inputtext/sonnet_subtitles_multiple_blank.txt",
+                "res/inputtext/sonnet_subtitles_multiple_rows.txt"
+        ]:
+            with self.assertRaises(TypeError):  # one context per path, so every file is exercised
+                self.load(path, TextFileFormat.SUBTITLES, 15, self.ID_REGEX_PARAMETERS_BAD)
+
     def test_read_plain(self):
         self.load("res/inputtext/sonnet_plain.txt", TextFileFormat.PLAIN, 15)
 
+    def test_read_plain_id_regex(self):
+        self.load("res/inputtext/sonnet_plain.txt", TextFileFormat.PLAIN, 15, self.ID_REGEX_PARAMETERS)
+
+    def test_read_plain_id_regex_bad(self):
+        with self.assertRaises(TypeError):
+            self.load("res/inputtext/sonnet_plain.txt", TextFileFormat.PLAIN, 15, self.ID_REGEX_PARAMETERS_BAD)
+
     def test_read_plain_utf8(self):
         self.load("res/inputtext/sonnet_plain_utf8.txt", TextFileFormat.PLAIN, 15)
 
+    def test_read_plain_utf8_id_regex(self):
+        self.load("res/inputtext/sonnet_plain_utf8.txt", TextFileFormat.PLAIN, 15, self.ID_REGEX_PARAMETERS)
+
+    def test_read_plain_utf8_id_regex_bad(self):
+        with self.assertRaises(TypeError):
+            self.load("res/inputtext/sonnet_plain_utf8.txt", TextFileFormat.PLAIN, 15, self.ID_REGEX_PARAMETERS_BAD)
+
     def test_read_parsed(self):
         self.load("res/inputtext/sonnet_parsed.txt", TextFileFormat.PARSED, 15)
 

diff --git a/aeneas/textfile.py b/aeneas/textfile.py
@@ -258,6 +258,8 @@ class TextFile(object):
     :raise ValueError: if ``file_format`` value is not allowed
     """
 
+    DEFAULT_ID_REGEX = u"f%06d"
+
     TAG = "TextFile"
 
     def __init__(
@@ -274,6 +276,8 @@ def __init__(
         self.logger = Logger()
         if logger is not None:
             self.logger = logger
+        if self.parameters is None:
+            self.parameters = dict()
         if (self.file_path is not None) and (self.file_format is not None):
             self._read_from_file()
 
@@ -486,7 +490,7 @@ def _read_from_file(self):
             self._read_plain(lines)
         if self.file_format == TextFileFormat.UNPARSED:
             self._log("Reading from format UNPARSED")
-            self._read_unparsed(lines, self.parameters)
+            self._read_unparsed(lines)
 
         # log the number of fragments
         self._log(["Parsed %d fragments", len(self.fragments)])
@@ -499,6 +503,7 @@ def _read_subtitles(self, lines):
         :type  lines: list of strings
         """
         self._log("Parsing fragments from subtitles text format")
+        id_regex = self._get_id_regex()
         lines = [line.strip() for line in lines]
         pairs = []
         i = 1
@@ -511,7 +516,7 @@ def _read_subtitles(self, lines):
                 while (following < len(lines) and (len(lines[following]) > 0)):
                     fragment_lines.append(lines[following])
                     following += 1
-                identifier = u"f" + str(i).zfill(6)
+                identifier = id_regex % i 
                 pairs.append([identifier, fragment_lines])
                 current = following
                 i += 1
@@ -524,6 +529,9 @@ def _read_parsed(self, lines):
 
         :param lines: the lines of the parsed text file
         :type  lines: list of strings
+
+        The fragment identifiers are the ones contained in the
+        input text file; the id regex parameter is not used here.
         """
         self._log("Parsing fragments from parsed text format")
         pairs = []
@@ -542,27 +550,28 @@ def _read_plain(self, lines):
 
         :param lines: the lines of the plain text file
         :type  lines: list of strings
+
+        Fragment identifiers are generated by applying the id regex
+        (see ``_get_id_regex()``) to a 1-based fragment counter.
         """
         self._log("Parsing fragments from plain text format")
+        id_regex = self._get_id_regex()
         lines = [line.strip() for line in lines]
         pairs = []
         i = 1
         for line in lines:
-            identifier = u"f" + str(i).zfill(6)
+            identifier = id_regex % i
             text = line.strip()
             pairs.append([identifier, [text]])
             i += 1
         self._create_text_fragments(pairs)
 
-    def _read_unparsed(self, lines, parameters):
+    def _read_unparsed(self, lines):
         """
         Read text fragments from an unparsed format text file.
 
         :param lines: the lines of the unparsed text file
         :type  lines: list of strings
-        :param parameters: additional parameters for parsing
-                           (e.g., class/id regex strings)
-        :type  parameters: dict
         """
         #
         # TODO better and/or parametric parsing,
@@ -573,23 +582,23 @@ def _read_unparsed(self, lines, parameters):
 
         # get filter attributes
         attributes = dict()
-        if gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX in parameters:
-            class_regex_string = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX]
+        if gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX in self.parameters:
+            class_regex_string = self.parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX]
             if class_regex_string is not None:
                 self._log(["Regex for class: '%s'", class_regex_string])
                 class_regex = re.compile(r".*\b" + class_regex_string + r"\b.*")
                 attributes['class'] = class_regex
-        if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX in parameters:
-            id_regex_string = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX]
+        if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX in self.parameters:
+            id_regex_string = self.parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX]
             if id_regex_string is not None:
                 self._log(["Regex for id: '%s'", id_regex_string])
                 id_regex = re.compile(r".*\b" + id_regex_string + r"\b.*")
                 attributes['id'] = id_regex
 
         # get id sorting algorithm
         id_sort = IDSortingAlgorithm.UNSORTED
-        if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT in parameters:
-            id_sort = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT]
+        if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT in self.parameters:
+            id_sort = self.parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT]
         self._log(["Sorting text fragments using '%s'", id_sort])
 
         # transform text in a soup object
@@ -632,5 +641,18 @@ def _create_text_fragments(self, pairs):
             fragment = TextFragment(identifier=pair[0], lines=pair[1])
             self.append_fragment(fragment)
 
+    def _get_id_regex(self):
+        """
+        Return the id regex: the ``os_task_file_id_regex`` parameter if set and not None, otherwise ``DEFAULT_ID_REGEX``.
+        """
+        id_regex = self.DEFAULT_ID_REGEX 
+        if (
+                (gc.PPN_TASK_OS_FILE_ID_REGEX in self.parameters) and
+                (self.parameters[gc.PPN_TASK_OS_FILE_ID_REGEX] is not None)
+        ):
+            id_regex = u"" + self.parameters[gc.PPN_TASK_OS_FILE_ID_REGEX]
+        self._log(["id_regex is %s", id_regex])
+        return id_regex
+
 
 
diff --git a/aeneas/tools/download.py b/aeneas/tools/download.py
@@ -58,11 +58,11 @@ def usage():
 
 def main():
     """ Entry point """
-
     if ("-h" in sys.argv) or ("--help" in sys.argv):
         # show full help
         usage()
-
+    if len(sys.argv) < 3:
+        usage()
     verbose = False
     best_audio = True
     source_url = sys.argv[1]

diff --git a/aeneas/tools/execute_task.py b/aeneas/tools/execute_task.py
@@ -103,6 +103,7 @@ def get_parameters():
     print "  is_text_unparsed_id_sort                : sort matched elements by id (unparsed) (*)"
     print ""
     print "  os_task_file_format                     : output sync map format (*)"
+    print "  os_task_file_id_regex                   : id regex for the output sync map (subtitles, plain)"
     print "  os_task_file_head_tail_format           : format audio head/tail (*)"
     print "  os_task_file_smil_audio_ref             : value for the audio ref (smil, smilh, smilm)"
     print "  os_task_file_smil_page_ref              : value for the text ref (smil, smilh, smilm)"
@@ -292,7 +293,7 @@ def run(argv):
 
     try:
         print "[INFO] Creating task..."
-        task = Task(config_string)
+        task = Task(config_string, logger=logger)
         task.audio_file_path_absolute = audio_file_path
         task.text_file_path_absolute = text_file_path
         task.sync_map_file_path_absolute = sync_map_file_path

diff --git a/aeneas/tools/read_text.py b/aeneas/tools/read_text.py
@@ -40,14 +40,17 @@ def usage():
     print "Parameters:"
     print "  -v                : verbose output"
     print "  class_regex=REGEX : extract text from elements with class attribute matching REGEX"
-    print "  id_regex=REGEX    : extract text from elements with id attribute matching REGEX"
+    print "  id_regex=REGEX    : extract text from elements with id attribute matching REGEX (unparsed)"
+    print "  id_regex=REGEX    : use REGEX for text id attributes (subtitles, plain)"
     print "  sort=ALGORITHM    : sort the matched element id attributes using ALGORITHM (lexicographic, numeric, unsorted)"
     print ""
     print "Examples:"
     print "  $ python -m %s 'From|fairest|creatures|we|desire|increase' list" % NAME
     print "  $ python -m %s %s parsed" % (NAME, TEXT_FILE_PARSED)
     print "  $ python -m %s %s plain" % (NAME, TEXT_FILE_PLAIN)
+    print "  $ python -m %s %s plain id_regex=Word%%06d" % (NAME, TEXT_FILE_PLAIN)
     print "  $ python -m %s %s subtitles" % (NAME, TEXT_FILE_SUBTITLES)
+    print "  $ python -m %s %s subtitles id_regex=Sub%%03d" % (NAME, TEXT_FILE_SUBTITLES)
     print "  $ python -m %s %s unparsed id_regex=f[0-9]*" % (NAME, TEXT_FILE_UNPARSED)
     print "  $ python -m %s %s unparsed class_regex=ra   sort=unsorted" % (NAME, TEXT_FILE_UNPARSED)
     print "  $ python -m %s %s unparsed id_regex=f[0-9]* sort=numeric" % (NAME, TEXT_FILE_UNPARSED)
@@ -74,6 +77,7 @@ def main():
                 key, value = args
                 if key == "id_regex":
                     parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value
+                    parameters[gc.PPN_TASK_OS_FILE_ID_REGEX] = value
                 if key == "class_regex":
                     parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value
                 if key == "sort":

diff --git a/setup.py b/setup.py
@@ -31,14 +31,15 @@
 
 setup(
     name="aeneas",
-    packages=["aeneas", "aeneas.tests", "aeneas.tools"],
+    packages=["aeneas", "aeneas.tools"],
     version="1.3.0",
     description="aeneas is a Python library and a set of tools to automagically synchronize audio and text",
     author="Alberto Pettarin",
     author_email="[email protected]",
     url="https://github.com/readbeyond/aeneas",
     license="GNU Affero General Public License v3 (AGPL v3)",
-    long_description=open("README.txt").read(),
+    long_description=open("README.txt", "r").read(),
+    install_requires=["BeautifulSoup", "lxml", "numpy", "pafy"],
     keywords=[
         "CSV",
         "DTW",
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,7 +3,7 @@ @@
     *.pyo
     *.swp
     *.so
-    MANIFEST
+    aeneas.egg-info
     aeneas/build
     bak
     build
@@ Expand Down @@