Skip to content

Commit

Permalink
Properly parse unclosed tags in code spans
Browse files Browse the repository at this point in the history
* fix unclosed pi in code span
* fix unclosed dec in code span
* fix unclosed tag in code span

Closes Python-Markdown#1066.
  • Loading branch information
waylan authored Nov 23, 2020
1 parent 82ac905 commit 1279074
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/change_log/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Python-Markdown Change Log

Under development: version 3.3.4 (a bug-fix release).

* Properly parse unclosed tags in code spans (#1066).
* Properly parse processing instructions in md_in_html (#1070).
* Properly parse code spans in md_in_html (#1069).

Expand Down
20 changes: 20 additions & 0 deletions markdown/extensions/md_in_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,26 @@ def handle_empty_tag(self, data, is_block):
else:
self.handle_data(self.md.htmlStash.store(data))

def parse_pi(self, i):
    """
    Handle a `<?` found at index `i`.

    When the parser is at the start of a line, in a tail, or inside a
    markdown stack, the work is delegated to the parent of `HTMLExtractor`.
    Otherwise only the two-character opener is emitted as data and `i + 2`
    is returned.
    """
    treat_as_text = (
        not self.at_line_start() and not self.intail and not self.mdstack
    )
    if treat_as_text:
        # Not the beginning of a raw block: emit just the opener as plain
        # data so that any tags which may follow are not consumed
        # (see #1066).
        self.handle_data('<?')
        return i + 2
    # HTMLExtractor carries this same override but without the `mdstack`
    # check, so bypass it and call HTMLExtractor's parent directly.
    return super(HTMLExtractor, self).parse_pi(i)

def parse_html_declaration(self, i):
    """
    Handle a `<!` found at index `i`.

    When the parser is at the start of a line, in a tail, or inside a
    markdown stack, the work is delegated to the parent of `HTMLExtractor`.
    Otherwise only the two-character opener is emitted as data and `i + 2`
    is returned.
    """
    treat_as_text = (
        not self.at_line_start() and not self.intail and not self.mdstack
    )
    if treat_as_text:
        # Not the beginning of a raw block: emit just the opener as plain
        # data so that any tags which may follow are not consumed
        # (see #1066).
        self.handle_data('<!')
        return i + 2
    # HTMLExtractor carries this same override but without the `mdstack`
    # check, so bypass it and call HTMLExtractor's parent directly.
    return super(HTMLExtractor, self).parse_html_declaration(i)


class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
Expand Down
32 changes: 32 additions & 0 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
# The replacement pattern is compiled with `re.VERBOSE`; each of the three negated character
# classes that gained a backtick is marked inline with `<= added backtick here`.
# NOTE(review): presumably this keeps a start-tag match from running past the closing
# backtick of a code span such as `<div` (see #1066) - confirm against the tests.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^`>\s]* # bare value <= added backtick here
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
Expand Down Expand Up @@ -230,6 +246,22 @@ def unknown_decl(self, data):
end = ']]>' if data.startswith('CDATA[') else ']>'
self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

def parse_pi(self, i):
    """
    Handle a `<?` found at index `i`.

    At the start of a line or while in a tail, the base class
    implementation is used. Otherwise only the two-character opener is
    emitted as data and `i + 2` is returned.
    """
    starts_raw_block = self.at_line_start() or self.intail
    if not starts_raw_block:
        # Not the beginning of a raw block: emit just the opener as plain
        # data so that any tags which may follow are not consumed
        # (see #1066).
        self.handle_data('<?')
        return i + 2
    return super().parse_pi(i)

def parse_html_declaration(self, i):
    """
    Handle a `<!` found at index `i`.

    At the start of a line or while in a tail, the base class
    implementation is used. Otherwise only the two-character opener is
    emitted as data and `i + 2` is returned.
    """
    starts_raw_block = self.at_line_start() or self.intail
    if not starts_raw_block:
        # Not the beginning of a raw block: emit just the opener as plain
        # data so that any tags which may follow are not consumed
        # (see #1066).
        self.handle_data('<!')
        return i + 2
    return super().parse_html_declaration(i)

# The rest has been copied from base class in standard lib to address #1036.
# As __starttag_text is private, all references to it must be in this subclass.
# The last few lines of parse_starttag are reversed so that handle_starttag
Expand Down
105 changes: 105 additions & 0 deletions tests/test_syntax/blocks/test_html_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,48 @@ def test_raw_missing_close_bracket(self):
'<p>&lt;foo</p>'
)

def test_raw_unclosed_tag_in_code_span(self):
# An unclosed `<div` inside a code span must render as escaped inline code,
# and the raw <div> block on the following lines must be passed through
# unchanged rather than being consumed by the unclosed tag (see #1066).
self.assertMarkdownRenders(
self.dedent(
"""
`<div`.
<div>
hello
</div>
"""
),
self.dedent(
"""
<p><code>&lt;div</code>.</p>
<div>
hello
</div>
"""
)
)

def test_raw_unclosed_tag_in_code_span_space(self):
# Same as the unclosed-tag code span case, but with spaces padding the
# tag inside the backticks. The expected output is the same escaped
# inline code, followed by the untouched raw <div> block.
self.assertMarkdownRenders(
self.dedent(
"""
` <div `.
<div>
hello
</div>
"""
),
self.dedent(
"""
<p><code>&lt;div</code>.</p>
<div>
hello
</div>
"""
)
)

def test_raw_attributes(self):
self.assertMarkdownRenders(
'<p id="foo", class="bar baz", style="margin: 15px; line-height: 1.5; text-align: center;">text</p>',
Expand Down Expand Up @@ -1073,6 +1115,27 @@ def test_raw_processing_instruction_indented(self):
)
)

def test_raw_processing_instruction_code_span(self):
# An unclosed processing instruction opener (`<?php`) inside a code span
# must render as escaped inline code, and the raw <div> block that follows
# must be left intact.
self.assertMarkdownRenders(
self.dedent(
"""
`<?php`
<div>
foo
</div>
"""
),
self.dedent(
"""
<p><code>&lt;?php</code></p>
<div>
foo
</div>
"""
)
)

def test_raw_declaration_one_line(self):
self.assertMarkdownRenders(
'<!DOCTYPE html>',
Expand Down Expand Up @@ -1110,6 +1173,27 @@ def test_raw_multiline_declaration(self):
)
)

def test_raw_declaration_code_span(self):
# An unclosed declaration opener (`<!`) inside a code span must render as
# escaped inline code, and the raw <div> block that follows must be left
# intact.
self.assertMarkdownRenders(
self.dedent(
"""
`<!`
<div>
foo
</div>
"""
),
self.dedent(
"""
<p><code>&lt;!</code></p>
<div>
foo
</div>
"""
)
)

def test_raw_cdata_one_line(self):
self.assertMarkdownRenders(
'<![CDATA[ document.write(">"); ]]>',
Expand Down Expand Up @@ -1190,6 +1274,27 @@ def test_raw_cdata_indented(self):
)
)

def test_raw_cdata_code_span(self):
# An unclosed marked-section opener (`<![`) inside a code span must render
# as escaped inline code, and the raw <div> block that follows must be
# left intact.
self.assertMarkdownRenders(
self.dedent(
"""
`<![`
<div>
foo
</div>
"""
),
self.dedent(
"""
<p><code>&lt;![</code></p>
<div>
foo
</div>
"""
)
)

def test_charref(self):
self.assertMarkdownRenders(
'&sect;',
Expand Down

0 comments on commit 1279074

Please sign in to comment.