diff --git a/docs/change_log/index.md b/docs/change_log/index.md index d7487a324..aed19e3f0 100644 --- a/docs/change_log/index.md +++ b/docs/change_log/index.md @@ -3,6 +3,10 @@ title: Change Log Python-Markdown Change Log ========================= +Under development: version 3.3.5 (a bug-fix release). + +* Make the `slugify_unicode` function not remove diacritical marks (#1118). + Feb 24, 2021: version 3.3.4 (a bug-fix release). * Properly parse unclosed tags in code spans (#1066). diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index d64ec1695..965ba4a83 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -23,16 +23,19 @@ import xml.etree.ElementTree as etree -def slugify(value, separator, encoding='ascii'): +def slugify(value, separator, unicode=False): """ Slugify a string, to make it URL friendly. """ - value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore') - value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower() + if not unicode: + # Replace Extended Latin characters with ASCII, i.e. žlutý → zluty + value = unicodedata.normalize('NFKD', value) + value = value.encode('ascii', 'ignore').decode('ascii') + value = re.sub(r'[^\w\s-]', '', value).strip().lower() return re.sub(r'[{}\s]+'.format(separator), separator, value) def slugify_unicode(value, separator): """ Slugify a string, to make it URL friendly while preserving Unicode characters. """ - return slugify(value, separator, 'utf-8') + return slugify(value, separator, unicode=True) IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$') diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 04893e3e4..83c990fd0 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -534,9 +534,9 @@ def testPermalinkWithUnicodeInID(self): from markdown.extensions.toc import slugify_unicode self.assertMarkdownRenders( '# Unicode ヘッダー', - '