Skip to content

Commit

Permalink
fix: updated vs original in time elements
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Mar 13, 2023
1 parent b55fd2f commit 9c4db3e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 21 deletions.
40 changes: 23 additions & 17 deletions htmldate/core.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -571,23 +571,31 @@ def examine_time_elements(
# go for datetime
if "datetime" in elem.attrib and len(elem.get("datetime")) > 6:
# shortcut: time pubdate
if "pubdate" in elem.attrib and elem.get("pubdate") == "pubdate":
if original_date:
shortcut_flag = True
LOGGER.debug("time pubdate found: %s", elem.get("datetime"))
# first choice: entry-date + datetime attribute
if (
"pubdate" in elem.attrib
and elem.get("pubdate") == "pubdate"
and original_date
):
shortcut_flag = True
LOGGER.debug(
"shortcut for time pubdate found: %s", elem.get("datetime")
)
# shortcuts: class attribute
elif "class" in elem.attrib:
if elem.get("class").startswith("entry-date") or elem.get(
"class"
).startswith("entry-time"):
# shortcut
if original_date:
shortcut_flag = True
LOGGER.debug("time/datetime found: %s", elem.get("datetime"))
if original_date and (
elem.get("class").startswith("entry-date")
or elem.get("class").startswith("entry-time")
):
shortcut_flag = True
LOGGER.debug(
"shortcut for time/datetime found: %s", elem.get("datetime")
)
# updated time
elif elem.get("class") == "updated" and not original_date:
elif not original_date and elem.get("class") == "updated":
shortcut_flag = True
LOGGER.debug(
"updated time/datetime found: %s", elem.get("datetime")
"shortcut for updated time/datetime found: %s",
elem.get("datetime"),
)
# datetime attribute
else:
Expand All @@ -613,11 +621,9 @@ def examine_time_elements(
min_date,
max_date,
)
if reference > 0:
break
# bare text in element
elif elem.text is not None and len(elem.text) > 6:
LOGGER.debug("time/datetime found: %s", elem.text)
LOGGER.debug("time/datetime found in text: %s", elem.text)
reference = compare_reference(
reference,
elem.text,
Expand Down
8 changes: 4 additions & 4 deletions tests/unit_tests.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -392,9 +392,9 @@ def test_exact_date():
)
== "2011-09-27"
)
# problem here:
# assert find_date('<html><body><time datetime="2011-09-27" class="entry-date"></time><time datetime="2011-09-28" class="updated"></time></body></html>', original_date=False) == '2011-09-28'
# assert find_date('<html><body><time datetime="2011-09-28" class="updated"></time><time datetime="2011-09-27" class="entry-date"></time></body></html>', original_date=True) == '2011-09-27'
# updated vs original in time elements
assert find_date('<html><body><time datetime="2011-09-27" class="entry-date"></time><time datetime="2011-09-28" class="updated"></time></body></html>', original_date=False) == '2011-09-28'
assert find_date('<html><body><time datetime="2011-09-28" class="updated"></time><time datetime="2011-09-27" class="entry-date"></time></body></html>', original_date=True) == '2011-09-27'
assert (
find_date(
'<html><body><time datetime="2011-09-28" class="updated"></time><time datetime="2011-09-27" class="entry-date"></time></body></html>',
Expand Down Expand Up @@ -1151,7 +1151,7 @@ def test_external_date_parser():
)
assert external_date_parser("Random text with 2020", OUTPUTFORMAT) is None
# https://github.com/scrapinghub/dateparser/issues/333
# assert external_date_parser('1 January 0001', '%d %B %Y') in ('01 January 1', '01 January 0001')
assert external_date_parser('1 January 0001', '%d %B %Y') in ('01 January 1', '01 January 0001')
assert external_date_parser("1 January 1900", "%d %B %Y") == "01 January 1900"
# https://github.com/scrapinghub/dateparser/issues/406
assert (
Expand Down

0 comments on commit 9c4db3e

Please sign in to comment.