Skip to content

Commit

Permalink
toc: Do not remove diacritical marks when slugify_unicode is used
Browse files Browse the repository at this point in the history
Update the existing test and add a new one to make sure that the
behavior of the default slugify function has not changed.

Fixes Python-Markdown#1118.
  • Loading branch information
mitya57 authored and waylan committed Mar 24, 2021
1 parent 14c2fa9 commit a114315
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 8 deletions.
4 changes: 4 additions & 0 deletions docs/change_log/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ title: Change Log
Python-Markdown Change Log
=========================

Under development: version 3.3.5 (a bug-fix release).

* Make the `slugify_unicode` function not remove diacritical marks (#1118).

Feb 24, 2021: version 3.3.4 (a bug-fix release).

* Properly parse unclosed tags in code spans (#1066).
Expand Down
11 changes: 7 additions & 4 deletions markdown/extensions/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,19 @@
import xml.etree.ElementTree as etree


def slugify(value, separator, encoding='ascii'):
def slugify(value, separator, unicode=False):
""" Slugify a string, to make it URL friendly. """
value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
if not unicode:
# Replace Extended Latin characters with ASCII, i.e. žlutý → zluty
value = unicodedata.normalize('NFKD', value)
value = value.encode('ascii', 'ignore').decode('ascii')
value = re.sub(r'[^\w\s-]', '', value).strip().lower()
return re.sub(r'[{}\s]+'.format(separator), separator, value)


def slugify_unicode(value, separator):
""" Slugify a string, to make it URL friendly while preserving Unicode characters. """
return slugify(value, separator, 'utf-8')
return slugify(value, separator, unicode=True)


IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
Expand Down
18 changes: 14 additions & 4 deletions tests/test_syntax/extensions/test_toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,9 +534,9 @@ def testPermalinkWithUnicodeInID(self):
from markdown.extensions.toc import slugify_unicode
self.assertMarkdownRenders(
'# Unicode ヘッダー',
'<h1 id="unicode-ヘッター">' # noqa
'<h1 id="unicode-ヘッダー">' # noqa
'Unicode ヘッダー' # noqa
'<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">&para;</a>' # noqa
'<a class="headerlink" href="#unicode-ヘッダー" title="Permanent link">&para;</a>' # noqa
'</h1>', # noqa
extensions=[TocExtension(permalink=True, slugify=slugify_unicode)]
)
Expand All @@ -545,9 +545,19 @@ def testPermalinkWithUnicodeTitle(self):
from markdown.extensions.toc import slugify_unicode
self.assertMarkdownRenders(
'# Unicode ヘッダー',
'<h1 id="unicode-ヘッター">' # noqa
'<h1 id="unicode-ヘッダー">' # noqa
'Unicode ヘッダー' # noqa
'<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">&para;</a>' # noqa
'<a class="headerlink" href="#unicode-ヘッダー" title="パーマリンク">&para;</a>' # noqa
'</h1>', # noqa
extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)]
)

def testPermalinkWithExtendedLatinInID(self):
self.assertMarkdownRenders(
'# Théâtre',
'<h1 id="theatre">' # noqa
'Théâtre' # noqa
'<a class="headerlink" href="#theatre" title="Permanent link">&para;</a>' # noqa
'</h1>', # noqa
extensions=[TocExtension(permalink=True)]
)

0 comments on commit a114315

Please sign in to comment.