Skip to content

Commit

Permalink
Merge pull request readbeyond#17 from pettarin/devel
Browse files Browse the repository at this point in the history
Added id regex for subtitles, plain. Close readbeyond#14
  • Loading branch information
readbeyond committed Oct 15, 2015
2 parents ff4971c + ada6c98 commit 9ff1430
Show file tree
Hide file tree
Showing 11 changed files with 137 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*.pyo
*.swp
*.so
MANIFEST
aeneas.egg-info
aeneas/build
bak
build
Expand Down
10 changes: 10 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
recursive-include aeneas/tools/res *
include check_dependencies.py
recursive-include docs *
prune docs/build
include LICENSE
recursive-include licenses *
include README.md
include README.txt
include requirements.txt
include VERSION
21 changes: 21 additions & 0 deletions aeneas/globalconstants.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,27 @@
"""

PPN_TASK_OS_FILE_ID_REGEX = "os_task_file_id_regex"
"""
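Key for the format string used to generate the fragment identifiers
of the sync map output file.
Despite the name, the value is not a regular expression:
it is a printf-style format string applied to the 1-based fragment index
with the ``%`` operator, so it needs a single placeholder such as ``%06d``.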
This parameter will be used only
when the input text file has `plain` or `subtitles` format;
for `parsed` and `unparsed` input text files, the identifiers
contained in the input text file will be used instead.
Usage: config string, TXT config file, XML config file
Values: string
Example::
os_task_file_id_regex=f%06d
os_task_file_id_regex=Word%03d
.. versionadded:: 1.3.1
"""

PPN_TASK_OS_FILE_NAME = "os_task_file_name"
"""
Key for the file name of the sync map output file
Expand Down
15 changes: 15 additions & 0 deletions aeneas/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def _populate_text_file(self):
parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX] = self.configuration.is_text_unparsed_class_regex
parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX] = self.configuration.is_text_unparsed_id_regex
parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT] = self.configuration.is_text_unparsed_id_sort
parameters[gc.PPN_TASK_OS_FILE_ID_REGEX] = self.configuration.os_file_id_regex
self.text_file = TextFile(
file_path=self.text_file_path_absolute,
file_format=self.configuration.is_text_file_format,
Expand Down Expand Up @@ -275,6 +276,7 @@ def __init__(self, config_string=None):
gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,

gc.PPN_TASK_OS_FILE_FORMAT,
gc.PPN_TASK_OS_FILE_ID_REGEX,
gc.PPN_TASK_OS_FILE_NAME,
gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF,
gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF,
Expand Down Expand Up @@ -631,6 +633,19 @@ def os_file_format(self):
def os_file_format(self, value):
self.fields[gc.PPN_TASK_OS_FILE_FORMAT] = value

@property
def os_file_id_regex(self):
    """
    The value stored in the configuration fields under
    ``gc.PPN_TASK_OS_FILE_ID_REGEX``, that is, the pattern
    for the fragment identifiers of the sync map output file.

    :rtype: string
    """
    key = gc.PPN_TASK_OS_FILE_ID_REGEX
    return self.fields[key]

@os_file_id_regex.setter
def os_file_id_regex(self, value):
    # store the new value under the same key read by the getter
    key = gc.PPN_TASK_OS_FILE_ID_REGEX
    self.fields[key] = value

@property
def os_file_name(self):
"""
Expand Down
4 changes: 4 additions & 0 deletions aeneas/tests/res/container/empty_dir/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
39 changes: 39 additions & 0 deletions aeneas/tests/test_textfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ class TestTextFile(unittest.TestCase):
gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX : "ra",
gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT : IDSortingAlgorithm.UNSORTED,
}
# Parameters carrying an id format with one integer placeholder ("%06d").
ID_REGEX_PARAMETERS = {
gc.PPN_TASK_OS_FILE_ID_REGEX : "word%06d"
}
# Parameters carrying an id format without any placeholder;
# the *_id_regex_bad tests expect a TypeError when loading with it.
ID_REGEX_PARAMETERS_BAD = {
gc.PPN_TASK_OS_FILE_ID_REGEX : "word"
}

def load(self, input_file_path=PLAIN_FILE_PATH, fmt=TextFileFormat.PLAIN, expected_length=15, parameters=None):
tfl = TextFile(at.get_abs_path(input_file_path), fmt, parameters)
Expand Down Expand Up @@ -140,12 +146,45 @@ def test_read_subtitles(self):
]:
self.load(path, TextFileFormat.SUBTITLES, 15)

def test_read_subtitles_id_regex(self):
    """
    Load each subtitles fixture with the custom id format,
    passing 15 as the expected length to ``load``.
    """
    fixtures = (
        "res/inputtext/sonnet_subtitles_with_end_newline.txt",
        "res/inputtext/sonnet_subtitles_no_end_newline.txt",
        "res/inputtext/sonnet_subtitles_multiple_blank.txt",
        "res/inputtext/sonnet_subtitles_multiple_rows.txt",
    )
    for fixture in fixtures:
        self.load(
            input_file_path=fixture,
            fmt=TextFileFormat.SUBTITLES,
            expected_length=15,
            parameters=self.ID_REGEX_PARAMETERS
        )

def test_read_subtitles_id_regex_bad(self):
    """
    Loading any subtitles fixture with an id format lacking
    a placeholder must raise ``TypeError``.
    """
    for path in [
            "res/inputtext/sonnet_subtitles_with_end_newline.txt",
            "res/inputtext/sonnet_subtitles_no_end_newline.txt",
            "res/inputtext/sonnet_subtitles_multiple_blank.txt",
            "res/inputtext/sonnet_subtitles_multiple_rows.txt"
    ]:
        # assertRaises must wrap each single call: when it wraps the
        # whole loop, the first TypeError leaves the block and the
        # remaining fixtures are never exercised.
        with self.assertRaises(TypeError):
            self.load(path, TextFileFormat.SUBTITLES, 15, self.ID_REGEX_PARAMETERS_BAD)

def test_read_plain(self):
    """
    Load the plain sonnet fixture, passing 15 as the expected length.
    """
    self.load(
        input_file_path="res/inputtext/sonnet_plain.txt",
        fmt=TextFileFormat.PLAIN,
        expected_length=15
    )

def test_read_plain_id_regex(self):
    """
    Load the plain sonnet fixture with the custom id format.
    """
    self.load(
        input_file_path="res/inputtext/sonnet_plain.txt",
        fmt=TextFileFormat.PLAIN,
        expected_length=15,
        parameters=self.ID_REGEX_PARAMETERS
    )

def test_read_plain_id_regex_bad(self):
    """
    Loading the plain sonnet fixture with an id format lacking
    a placeholder is expected to raise ``TypeError``.
    """
    self.assertRaises(
        TypeError,
        self.load,
        "res/inputtext/sonnet_plain.txt",
        TextFileFormat.PLAIN,
        15,
        self.ID_REGEX_PARAMETERS_BAD
    )

def test_read_plain_utf8(self):
    """
    Load the UTF-8 plain sonnet fixture, passing 15 as the expected length.
    """
    self.load(
        input_file_path="res/inputtext/sonnet_plain_utf8.txt",
        fmt=TextFileFormat.PLAIN,
        expected_length=15
    )

def test_read_plain_utf8_id_regex(self):
    """
    Load the UTF-8 plain sonnet fixture with the custom id format.
    """
    self.load(
        input_file_path="res/inputtext/sonnet_plain_utf8.txt",
        fmt=TextFileFormat.PLAIN,
        expected_length=15,
        parameters=self.ID_REGEX_PARAMETERS
    )

def test_read_plain_utf8_id_regex_bad(self):
    """
    Loading the UTF-8 plain sonnet fixture with an id format lacking
    a placeholder is expected to raise ``TypeError``.
    """
    self.assertRaises(
        TypeError,
        self.load,
        "res/inputtext/sonnet_plain_utf8.txt",
        TextFileFormat.PLAIN,
        15,
        self.ID_REGEX_PARAMETERS_BAD
    )

def test_read_parsed(self):
    """
    Load the parsed sonnet fixture, passing 15 as the expected length.
    """
    self.load(
        input_file_path="res/inputtext/sonnet_parsed.txt",
        fmt=TextFileFormat.PARSED,
        expected_length=15
    )

Expand Down
48 changes: 35 additions & 13 deletions aeneas/textfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ class TextFile(object):
:raise ValueError: if ``file_format`` value is not allowed
"""

DEFAULT_ID_REGEX = u"f%06d"

TAG = "TextFile"

def __init__(
Expand All @@ -274,6 +276,8 @@ def __init__(
self.logger = Logger()
if logger is not None:
self.logger = logger
if self.parameters is None:
self.parameters = dict()
if (self.file_path is not None) and (self.file_format is not None):
self._read_from_file()

Expand Down Expand Up @@ -486,7 +490,7 @@ def _read_from_file(self):
self._read_plain(lines)
if self.file_format == TextFileFormat.UNPARSED:
self._log("Reading from format UNPARSED")
self._read_unparsed(lines, self.parameters)
self._read_unparsed(lines)

# log the number of fragments
self._log(["Parsed %d fragments", len(self.fragments)])
Expand All @@ -499,6 +503,7 @@ def _read_subtitles(self, lines):
:type lines: list of strings
"""
self._log("Parsing fragments from subtitles text format")
id_regex = self._get_id_regex()
lines = [line.strip() for line in lines]
pairs = []
i = 1
Expand All @@ -511,7 +516,7 @@ def _read_subtitles(self, lines):
while (following < len(lines) and (len(lines[following]) > 0)):
fragment_lines.append(lines[following])
following += 1
identifier = u"f" + str(i).zfill(6)
identifier = id_regex % i
pairs.append([identifier, fragment_lines])
current = following
i += 1
Expand All @@ -524,6 +529,9 @@ def _read_parsed(self, lines):
:param lines: the lines of the parsed text file
:type lines: list of strings
:param parameters: additional parameters for parsing
(e.g., class/id regex strings)
:type parameters: dict
"""
self._log("Parsing fragments from parsed text format")
pairs = []
Expand All @@ -542,27 +550,28 @@ def _read_plain(self, lines):
:param lines: the lines of the plain text file
:type lines: list of strings
:param parameters: additional parameters for parsing
(e.g., class/id regex strings)
:type parameters: dict
"""
self._log("Parsing fragments from plain text format")
id_regex = self._get_id_regex()
lines = [line.strip() for line in lines]
pairs = []
i = 1
for line in lines:
identifier = u"f" + str(i).zfill(6)
identifier = id_regex % i
text = line.strip()
pairs.append([identifier, [text]])
i += 1
self._create_text_fragments(pairs)

def _read_unparsed(self, lines, parameters):
def _read_unparsed(self, lines):
"""
Read text fragments from an unparsed format text file.
:param lines: the lines of the unparsed text file
:type lines: list of strings
:param parameters: additional parameters for parsing
(e.g., class/id regex strings)
:type parameters: dict
"""
#
# TODO better and/or parametric parsing,
Expand All @@ -573,23 +582,23 @@ def _read_unparsed(self, lines, parameters):

# get filter attributes
attributes = dict()
if gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX in parameters:
class_regex_string = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX]
if gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX in self.parameters:
class_regex_string = self.parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX]
if class_regex_string is not None:
self._log(["Regex for class: '%s'", class_regex_string])
class_regex = re.compile(r".*\b" + class_regex_string + r"\b.*")
attributes['class'] = class_regex
if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX in parameters:
id_regex_string = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX]
if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX in self.parameters:
id_regex_string = self.parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX]
if id_regex_string is not None:
self._log(["Regex for id: '%s'", id_regex_string])
id_regex = re.compile(r".*\b" + id_regex_string + r"\b.*")
attributes['id'] = id_regex

# get id sorting algorithm
id_sort = IDSortingAlgorithm.UNSORTED
if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT in parameters:
id_sort = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT]
if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT in self.parameters:
id_sort = self.parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT]
self._log(["Sorting text fragments using '%s'", id_sort])

# transform text in a soup object
Expand Down Expand Up @@ -632,5 +641,18 @@ def _create_text_fragments(self, pairs):
fragment = TextFragment(identifier=pair[0], lines=pair[1])
self.append_fragment(fragment)

def _get_id_regex(self):
    """
    Return the pattern used to build fragment identifiers.

    The value stored in ``self.parameters`` under
    ``gc.PPN_TASK_OS_FILE_ID_REGEX`` is used when present and not ``None``;
    otherwise ``DEFAULT_ID_REGEX`` is returned.
    The chosen value is logged before being returned.

    :rtype: unicode
    """
    custom = self.parameters.get(gc.PPN_TASK_OS_FILE_ID_REGEX)
    if custom is None:
        # key missing or explicitly set to None: fall back to the default
        id_regex = self.DEFAULT_ID_REGEX
    else:
        # prepending u"" yields a unicode string, as in the default value
        id_regex = u"" + custom
    self._log(["id_regex is %s", id_regex])
    return id_regex



4 changes: 2 additions & 2 deletions aeneas/tools/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ def usage():

def main():
""" Entry point """

if ("-h" in sys.argv) or ("--help" in sys.argv):
# show full help
usage()

if len(sys.argv) < 3:
usage()
verbose = False
best_audio = True
source_url = sys.argv[1]
Expand Down
3 changes: 2 additions & 1 deletion aeneas/tools/execute_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def get_parameters():
print " is_text_unparsed_id_sort : sort matched elements by id (unparsed) (*)"
print ""
print " os_task_file_format : output sync map format (*)"
print " os_task_file_id_regex : id regex for the output sync map (subtitles, plain)"
print " os_task_file_head_tail_format : format audio head/tail (*)"
print " os_task_file_smil_audio_ref : value for the audio ref (smil, smilh, smilm)"
print " os_task_file_smil_page_ref : value for the text ref (smil, smilh, smilm)"
Expand Down Expand Up @@ -292,7 +293,7 @@ def run(argv):

try:
print "[INFO] Creating task..."
task = Task(config_string)
task = Task(config_string, logger=logger)
task.audio_file_path_absolute = audio_file_path
task.text_file_path_absolute = text_file_path
task.sync_map_file_path_absolute = sync_map_file_path
Expand Down
6 changes: 5 additions & 1 deletion aeneas/tools/read_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,17 @@ def usage():
print "Parameters:"
print " -v : verbose output"
print " class_regex=REGEX : extract text from elements with class attribute matching REGEX"
print " id_regex=REGEX : extract text from elements with id attribute matching REGEX"
print " id_regex=REGEX : extract text from elements with id attribute matching REGEX (unparsed)"
print " id_regex=REGEX : use REGEX for text id attributes (subtitles, plain)"
print " sort=ALGORITHM : sort the matched element id attributes using ALGORITHM (lexicographic, numeric, unsorted)"
print ""
print "Examples:"
print " $ python -m %s 'From|fairest|creatures|we|desire|increase' list" % NAME
print " $ python -m %s %s parsed" % (NAME, TEXT_FILE_PARSED)
print " $ python -m %s %s plain" % (NAME, TEXT_FILE_PLAIN)
print " $ python -m %s %s plain id_regex=Word%%06d" % (NAME, TEXT_FILE_PLAIN)
print " $ python -m %s %s subtitles" % (NAME, TEXT_FILE_SUBTITLES)
print " $ python -m %s %s subtitles id_regex=Sub%%03d" % (NAME, TEXT_FILE_SUBTITLES)
print " $ python -m %s %s unparsed id_regex=f[0-9]*" % (NAME, TEXT_FILE_UNPARSED)
print " $ python -m %s %s unparsed class_regex=ra sort=unsorted" % (NAME, TEXT_FILE_UNPARSED)
print " $ python -m %s %s unparsed id_regex=f[0-9]* sort=numeric" % (NAME, TEXT_FILE_UNPARSED)
Expand All @@ -74,6 +77,7 @@ def main():
key, value = args
if key == "id_regex":
parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value
parameters[gc.PPN_TASK_OS_FILE_ID_REGEX] = value
if key == "class_regex":
parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value
if key == "sort":
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,15 @@

setup(
name="aeneas",
packages=["aeneas", "aeneas.tests", "aeneas.tools"],
packages=["aeneas", "aeneas.tools"],
version="1.3.0",
description="aeneas is a Python library and a set of tools to automagically synchronize audio and text",
author="Alberto Pettarin",
author_email="[email protected]",
url="https://github.com/readbeyond/aeneas",
license="GNU Affero General Public License v3 (AGPL v3)",
long_description=open("README.txt").read(),
long_description=open("README.txt", "r").read(),
install_requires=["BeautifulSoup", "lxml", "numpy", "pafy"],
keywords=[
"CSV",
"DTW",
Expand Down

0 comments on commit 9ff1430

Please sign in to comment.