Skip to content

Commit

Permalink
Support min_date/max_date as datetimes or datetime strings (adbar#74)
Browse files Browse the repository at this point in the history
* Support min_date/max_date as datetimes or datetime strings

* support passing datetimes and including time component, e.g.
  * `find_date(..., min_date=datetime(...))`
  * `find_date(..., max_date="2020-03-14 12:46:58")`
* raise warning on invalid argument while preserving previous behavior/result
* correctly respect typing annotation
* consolidate code

* fixup black

* fixup black once again, apologies

* fix mypy

* Project-wide monkey-patch datetime.fromisoformat

for Python <=3.6

* Remove duplicate code

* fixup monkeypatch. Should have used an IDE 😅

* Fix mypy on Py3.6

* review function: check_date_input()

* Add a test case for min_date = datetime object

* extend tests

* update docs

* tests: fix timezone

* cleaning: elif → if

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
kernc and adbar authored Mar 15, 2023
1 parent 5a2448b commit aa2bf3a
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 48 deletions.
4 changes: 2 additions & 2 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,9 @@ For usage instructions see ``htmldate -h``:
name of input file for batch processing (similar to wget -i)
--original original date prioritized
-min MINDATE, --mindate MINDATE
earliest acceptable date (YYYY-MM-DD)
earliest acceptable date (ISO 8601 YMD)
-max MAXDATE, --maxdate MAXDATE
latest acceptable date (YYYY-MM-DD)
latest acceptable date (ISO 8601 YMD)
-u URL, --URL URL custom URL download
-v, --verbose increase output verbosity
--version show version information and exit
Expand Down
8 changes: 8 additions & 0 deletions htmldate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@


import logging
from datetime import datetime

try:
datetime.fromisoformat # type: ignore[attr-defined]
except AttributeError: # Python 3.6
from backports.datetime_fromisoformat import MonkeyPatch # type: ignore

MonkeyPatch.patch_fromisoformat()

from .core import find_date

Expand Down
4 changes: 2 additions & 2 deletions htmldate/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ def parse_args(args: Any) -> Any:
"--original", help="original date prioritized", action="store_true"
)
argsparser.add_argument(
"-min", "--mindate", help="earliest acceptable date (YYYY-MM-DD)", type=str
"-min", "--mindate", help="earliest acceptable date (ISO 8601 YMD)", type=str
)
argsparser.add_argument(
"-max", "--maxdate", help="latest acceptable date (YYYY-MM-DD)", type=str
"-max", "--maxdate", help="latest acceptable date (ISO 8601 YMD)", type=str
)
argsparser.add_argument("-u", "--URL", help="custom URL download", type=str)
argsparser.add_argument(
Expand Down
28 changes: 17 additions & 11 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from copy import deepcopy
from datetime import datetime
from functools import lru_cache, partial
from typing import Match, Optional, Pattern, Tuple, Counter as Counter_Type
from typing import Match, Optional, Pattern, Tuple, Union, Counter as Counter_Type

from lxml.html import HtmlElement, tostring # type: ignore

Expand Down Expand Up @@ -263,11 +263,11 @@ def examine_header(
one (e.g. last modified, updated time)
:type original_date: boolean
:param min_date:
Set the earliest acceptable date manually (YYYY-MM-DD format)
:type min_date: string
Set the earliest acceptable date manually (ISO 8601 YMD format)
:type min_date: datetime
:param max_date:
Set the latest acceptable date manually (YYYY-MM-DD format)
:type max_date: string
Set the latest acceptable date manually (ISO 8601 YMD format)
:type max_date: datetime
:return: Returns a valid date expression as a string, or None
"""
Expand Down Expand Up @@ -673,6 +673,12 @@ def search_page(
Look for original date (e.g. publication date) instead of most recent
one (e.g. last modified, updated time)
:type original_date: boolean
:param min_date:
Set the earliest acceptable date manually (ISO 8601 YMD format)
:type min_date: datetime
:param max_date:
Set the latest acceptable date manually (ISO 8601 YMD format)
:type max_date: datetime
:return: Returns a valid date expression as a string, or None
"""
Expand Down Expand Up @@ -941,8 +947,8 @@ def find_date(
outputformat: str = "%Y-%m-%d",
url: Optional[str] = None,
verbose: bool = False,
min_date: Optional[datetime] = None,
max_date: Optional[datetime] = None,
min_date: Optional[Union[datetime, str]] = None,
max_date: Optional[Union[datetime, str]] = None,
deferred_url_extractor: bool = False,
) -> Optional[str]:
"""
Expand Down Expand Up @@ -971,11 +977,11 @@ def find_date(
Set verbosity level for debugging
:type verbose: boolean
:param min_date:
Set the earliest acceptable date manually (YYYY-MM-DD format)
:type min_date: string
Set the earliest acceptable date manually (ISO 8601 YMD format)
:type min_date: datetime, string
:param max_date:
Set the latest acceptable date manually (YYYY-MM-DD format)
:type max_date: string
Set the latest acceptable date manually (ISO 8601 YMD format)
:type max_date: datetime, string
:param deferred_url_extractor:
Use url extractor as backup only to prioritize full expressions,
e.g. of the type `%Y-%m-%d %H:%M:%S`
Expand Down
7 changes: 0 additions & 7 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,6 @@

from lxml.html import HtmlElement # type: ignore

try:
datetime.fromisoformat # type: ignore[attr-defined]
except AttributeError: # Python 3.6
from backports.datetime_fromisoformat import MonkeyPatch # type: ignore

MonkeyPatch.patch_fromisoformat()

# own
from .settings import CACHE_SIZE
from .validators import convert_date, date_validator
Expand Down
38 changes: 16 additions & 22 deletions htmldate/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,31 +194,25 @@ def check_extracted_reference(
return None


def get_min_date(min_date: Optional[Union[datetime, str]]) -> datetime:
"""Validates the minimum date and/or defaults to earliest plausible date"""
if min_date is not None and isinstance(min_date, str):
def check_date_input(
date_object: Optional[Union[datetime, str]], default: datetime
) -> datetime:
"Check if the input is a usable datetime or ISO date string, return default otherwise"
if isinstance(date_object, datetime):
return date_object
if isinstance(date_object, str):
try:
# internal conversion from Y-M-D format
min_date = datetime(
int(min_date[:4]), int(min_date[5:7]), int(min_date[8:10])
)
return datetime.fromisoformat(date_object) # type: ignore
except ValueError:
min_date = MIN_DATE
else:
min_date = MIN_DATE
return min_date
LOGGER.warning("invalid datetime string: %s", date_object)
return default # no input or error thrown


def get_min_date(min_date: Optional[Union[datetime, str]]) -> datetime:
"""Validates the minimum date and/or defaults to earliest plausible date"""
return check_date_input(min_date, MIN_DATE)


def get_max_date(max_date: Optional[Union[datetime, str]]) -> datetime:
"""Validates the maximum date and/or defaults to latest plausible date"""
if max_date is not None and isinstance(max_date, str):
try:
# internal conversion from Y-M-D format
max_date = datetime(
int(max_date[:4]), int(max_date[5:7]), int(max_date[8:10])
)
except ValueError:
max_date = datetime.now()
else:
max_date = datetime.now()
return max_date
return check_date_input(max_date, datetime.now())
43 changes: 39 additions & 4 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,19 +112,26 @@ def test_input():
mock = Mock()
mock.data = b" "
assert decode_response(mock) is not None

# find_date logic
with pytest.raises(TypeError):
assert find_date(None) is None
assert find_date("<" * 100) is None
assert find_date("<html></html>", verbose=True) is None
assert find_date("<html><body>\u0008this\xdf\n\u001f+\uffff</body></html>") is None

# min and max date output
assert get_min_date("2020-02-20").date() == datetime.date(2020, 2, 20)
assert get_min_date(None).date() == datetime.date(1995, 1, 1)
assert get_min_date("3030-30-50").date() == datetime.date(1995, 1, 1)
assert get_min_date(datetime.datetime(1990, 1, 1)) == datetime.datetime(1990, 1, 1)
assert get_min_date("2020-02-20T13:30:00") == datetime.datetime(2020, 2, 20, 13, 30)

assert get_max_date("2020-02-20").date() == datetime.date(2020, 2, 20)
assert get_max_date(None).date() == datetime.date.today()
assert get_max_date("3030-30-50").date() == datetime.date.today()
assert get_max_date(datetime.datetime(3000, 1, 1)) == datetime.datetime(3000, 1, 1)
assert get_max_date("2020-02-20T13:30:00") == datetime.datetime(2020, 2, 20, 13, 30)


def test_sanity():
Expand Down Expand Up @@ -393,8 +400,20 @@ def test_exact_date():
== "2011-09-27"
)
# updated vs original in time elements
assert find_date('<html><body><time datetime="2011-09-27" class="entry-date"></time><time datetime="2011-09-28" class="updated"></time></body></html>', original_date=False) == '2011-09-28'
assert find_date('<html><body><time datetime="2011-09-28" class="updated"></time><time datetime="2011-09-27" class="entry-date"></time></body></html>', original_date=True) == '2011-09-27'
assert (
find_date(
'<html><body><time datetime="2011-09-27" class="entry-date"></time><time datetime="2011-09-28" class="updated"></time></body></html>',
original_date=False,
)
== "2011-09-28"
)
assert (
find_date(
'<html><body><time datetime="2011-09-28" class="updated"></time><time datetime="2011-09-27" class="entry-date"></time></body></html>',
original_date=True,
)
== "2011-09-27"
)
assert (
find_date(
'<html><body><time datetime="2011-09-28" class="updated"></time><time datetime="2011-09-27" class="entry-date"></time></body></html>',
Expand Down Expand Up @@ -616,6 +635,20 @@ def test_exact_date():
)
== "1991-01-02"
)
assert (
find_date(
'<html><meta><meta property="article:published_time" content="1991-01-02T01:01:00+00:00"></meta><body></body></html>',
min_date="1991-01-02T01:02:00+00:00",
)
is None
)
assert (
find_date(
'<html><meta><meta property="article:published_time" content="1991-01-02T01:01:00+00:00"></meta><body></body></html>',
min_date="1991-01-02T01:00:00+00:00",
)
== "1991-01-02"
)

# wild text in body
assert (
Expand Down Expand Up @@ -1151,7 +1184,10 @@ def test_external_date_parser():
)
assert external_date_parser("Random text with 2020", OUTPUTFORMAT) is None
# https://github.com/scrapinghub/dateparser/issues/333
assert external_date_parser('1 January 0001', '%d %B %Y') in ('01 January 1', '01 January 0001')
assert external_date_parser("1 January 0001", "%d %B %Y") in (
"01 January 1",
"01 January 0001",
)
assert external_date_parser("1 January 1900", "%d %B %Y") == "01 January 1900"
# https://github.com/scrapinghub/dateparser/issues/406
assert (
Expand Down Expand Up @@ -1720,7 +1756,6 @@ def test_dependencies():


if __name__ == "__main__":

# function-level
test_input()
test_sanity()
Expand Down

0 comments on commit aa2bf3a

Please sign in to comment.