diff --git a/htmldate/core.py b/htmldate/core.py index f68477f8..96501528 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -571,23 +571,31 @@ def examine_time_elements( # go for datetime if "datetime" in elem.attrib and len(elem.get("datetime")) > 6: # shortcut: time pubdate - if "pubdate" in elem.attrib and elem.get("pubdate") == "pubdate": - if original_date: - shortcut_flag = True - LOGGER.debug("time pubdate found: %s", elem.get("datetime")) - # first choice: entry-date + datetime attribute + if ( + "pubdate" in elem.attrib + and elem.get("pubdate") == "pubdate" + and original_date + ): + shortcut_flag = True + LOGGER.debug( + "shortcut for time pubdate found: %s", elem.get("datetime") + ) + # shortcuts: class attribute elif "class" in elem.attrib: - if elem.get("class").startswith("entry-date") or elem.get( - "class" - ).startswith("entry-time"): - # shortcut - if original_date: - shortcut_flag = True - LOGGER.debug("time/datetime found: %s", elem.get("datetime")) + if original_date and ( + elem.get("class").startswith("entry-date") + or elem.get("class").startswith("entry-time") + ): + shortcut_flag = True + LOGGER.debug( + "shortcut for time/datetime found: %s", elem.get("datetime") + ) # updated time - elif elem.get("class") == "updated" and not original_date: + elif not original_date and elem.get("class") == "updated": + shortcut_flag = True LOGGER.debug( - "updated time/datetime found: %s", elem.get("datetime") + "shortcut for updated time/datetime found: %s", + elem.get("datetime"), ) # datetime attribute else: @@ -613,11 +621,9 @@ def examine_time_elements( min_date, max_date, ) - if reference > 0: - break # bare text in element elif elem.text is not None and len(elem.text) > 6: - LOGGER.debug("time/datetime found: %s", elem.text) + LOGGER.debug("time/datetime found in text: %s", elem.text) reference = compare_reference( reference, elem.text, diff --git a/tests/unit_tests.py b/tests/unit_tests.py index da2042f5..0d3149b3 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -392,9 +392,9 @@ def test_exact_date(): ) == "2011-09-27" ) - # problem here: - # assert find_date('', original_date=False) == '2011-09-28' - # assert find_date('', original_date=True) == '2011-09-27' + # updated vs original in time elements + assert find_date('', original_date=False) == '2011-09-28' + assert find_date('', original_date=True) == '2011-09-27' assert ( find_date( '', @@ -1151,7 +1151,7 @@ def test_external_date_parser(): ) assert external_date_parser("Random text with 2020", OUTPUTFORMAT) is None # https://github.com/scrapinghub/dateparser/issues/333 - # assert external_date_parser('1 January 0001', '%d %B %Y') in ('01 January 1', '01 January 0001') + assert external_date_parser('1 January 0001', '%d %B %Y') in ('01 January 1', '01 January 0001') assert external_date_parser("1 January 1900", "%d %B %Y") == "01 January 1900" # https://github.com/scrapinghub/dateparser/issues/406 assert (