Skip to content

Commit

Permalink
Fixes jointakahe#431 - Preserve href when stripping <a> tags (jointak…
Browse files Browse the repository at this point in the history
  • Loading branch information
manfre authored Jan 18, 2023
1 parent bd70769 commit cc75863
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 2 deletions.
33 changes: 31 additions & 2 deletions core/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,35 @@ def __iter__(self):
yield token


class UnlinkifyFilter(Filter):
    """
    Token filter that swaps each hyperlink for its bare target URL.

    Intended for use when <a> tags are being stripped: the link location
    is kept in the output, at the cost of losing the original link text.
    An <a> whose href is missing or empty loses only its tags; the text
    inside it is passed through unchanged.
    """

    def __iter__(self):
        # True from an <a href="..."> start tag until its matching </a>;
        # everything seen in between is dropped.
        inside_replaced_link = False

        for tok in Filter.__iter__(self):
            if tok.get("name") != "a":
                # Not an anchor token: forward it unless it sits inside a
                # link whose content has been replaced by the href.
                if not inside_replaced_link:
                    yield tok
                continue

            # Anchor tokens themselves are never forwarded.
            if tok["type"] == "EndTag":
                inside_replaced_link = False
                continue

            target = tok["data"].get((None, "href"))
            if not target:
                # No usable href: drop the tag and let the inner text
                # flow through as ordinary tokens.
                continue

            # Emit the href as plain text in place of the whole link.
            yield {"data": target, "type": "Characters"}
            inside_replaced_link = True


def allow_a(tag: str, name: str, value: str):
if name in ["href", "title", "class"]:
return True
Expand Down Expand Up @@ -150,7 +179,7 @@ def strip_html(post_html: str, *, linkify: bool = True) -> str:
strip=True,
filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
if linkify
else [],
else [UnlinkifyFilter],
)
return mark_safe(cleaner.clean(post_html))

Expand All @@ -163,7 +192,7 @@ def html_to_plaintext(post_html: str) -> str:
# Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
# Remove all other HTML and return
cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
return cleaner.clean(post_html).strip()


Expand Down
10 changes: 10 additions & 0 deletions tests/core/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ def test_html_to_plaintext():
== "Hi!\n\nHow are\n you?\n\ntoday"
)

assert (
html_to_plaintext(
'<p><a href="https://fedi.takahe.social/with/a/long/path">'
'<b>The</b> <img src="takahe.png"> Link</a> '
'<a href="">Empty href</a> '
"<a>Empty A</a></p>"
)
== "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
)


def test_sanitize_post():

Expand Down

0 comments on commit cc75863

Please sign in to comment.