From ca4774131b9b8ee40b4d7f5c1ba296af4700207f Mon Sep 17 00:00:00 2001 From: Shawn Rutledge Date: Fri, 1 Mar 2024 00:39:50 -0700 Subject: [PATCH] QTextMarkdownWriter: escape special characters (line or word prefix) Try to avoid writing anything that the parser would misinterpret. Escape pre-existing backslashes, but not those that are already escaped. Optimize maybeEscapeFirstChar() slightly and apply it to every line of output (except in code blocks), not only to new lines created by word-wrapping. Since it would be hard to do this without using regular expressions, the markdown writer feature now depends on the regex feature. Fixes: QTBUG-96051 Fixes: QTBUG-122083 Pick-to: 6.7 Change-Id: I8d95366501fd31441829081c668f11a3a3a23fe2 Reviewed-by: Axel Spoerl Reviewed-by: Qt CI Bot --- src/gui/configure.cmake | 1 + src/gui/text/qtextmarkdownwriter.cpp | 54 ++++++++++++++- .../tst_qtextmarkdownimporter.cpp | 2 +- .../tst_qtextmarkdownwriter.cpp | 66 +++++++++++++++++-- 4 files changed, 115 insertions(+), 8 deletions(-) diff --git a/src/gui/configure.cmake b/src/gui/configure.cmake index 21d98eab1f5..754ca8d761d 100644 --- a/src/gui/configure.cmake +++ b/src/gui/configure.cmake @@ -1012,6 +1012,7 @@ qt_feature("system-textmarkdownreader" PUBLIC qt_feature("textmarkdownwriter" PUBLIC SECTION "Kernel" LABEL "MarkdownWriter" + CONDITION QT_FEATURE_regularexpression PURPOSE "Provides a Markdown (CommonMark and GitHub) writer" ) qt_feature("textodfwriter" PUBLIC diff --git a/src/gui/text/qtextmarkdownwriter.cpp b/src/gui/text/qtextmarkdownwriter.cpp index 64dd88d82ce..5ab733df005 100644 --- a/src/gui/text/qtextmarkdownwriter.cpp +++ b/src/gui/text/qtextmarkdownwriter.cpp @@ -12,6 +12,7 @@ #include "qtextimagehandler_p.h" #include "qtextmarkdownimporter_p.h" #include "qloggingcategory.h" +#include #if QT_CONFIG(itemmodel) #include "qabstractitemmodel.h" #endif @@ -286,15 +287,58 @@ static int adjacentBackticksCount(const QString &s) return ret; } +/*! \internal + Escape anything at the beginning of a line of markdown that would be + misinterpreted by a markdown parser, including any period that follows a + number (to avoid misinterpretation as a numbered list item). + https://spec.commonmark.org/0.31.2/#backslash-escapes +*/ static void maybeEscapeFirstChar(QString &s) { + static const QRegularExpression numericListRe(uR"(\d+([\.)])\s)"_s); + static const QLatin1StringView specialFirstCharacters("#*+-"); + QString sTrimmed = s.trimmed(); if (sTrimmed.isEmpty()) return; - char firstChar = sTrimmed.at(0).toLatin1(); - if (firstChar == '*' || firstChar == '+' || firstChar == '-') { - int i = s.indexOf(QLatin1Char(firstChar)); + QChar firstChar = sTrimmed.at(0); + if (specialFirstCharacters.contains(firstChar)) { + int i = s.indexOf(firstChar); // == 0 unless s got trimmed s.insert(i, u'\\'); + } else { + auto match = numericListRe.match(s, 0, QRegularExpression::NormalMatch, + QRegularExpression::AnchorAtOffsetMatchOption); + if (match.hasMatch()) + s.insert(match.capturedStart(1), qtmw_Backslash); + } +} + +/*! \internal + Escape unescaped backslashes. Then escape any special character that stands + alone or prefixes a "word", including the \c < that starts an HTML tag. + https://spec.commonmark.org/0.31.2/#backslash-escapes +*/ +static void escapeSpecialCharacters(QString &s) +{ + static const QRegularExpression backslashRe(uR"([^\\]\\)"_s); + static const QRegularExpression spaceRe(uR"(\s+)"_s); + static const QRegularExpression specialRe(uR"([= 0) { + if (int j = s.indexOf(backslashRe, i); j >= 0) { + ++j; // we found some char before the backslash that needs escaping + if (s.size() == j + 1 || s.at(j + 1) != qtmw_Backslash) + s.insert(j, qtmw_Backslash); + i = j + 3; + } + if (int j = s.indexOf(specialRe, i); j >= 0 && (j == 0 || s.at(j - 1) != u'\\')) { + s.insert(j, qtmw_Backslash); + i = j + 3; + } + i = s.indexOf(spaceRe, i); + if (i >= 0) + ++i; // past the whitespace, if found } } @@ -504,6 +548,10 @@ int QTextMarkdownWriter::writeBlock(const QTextBlock &block, bool wrap, bool ign QString fragmentText = frag.fragment().text(); while (fragmentText.endsWith(qtmw_Newline)) fragmentText.chop(1); + if (!(m_fencedCodeBlock || m_indentedCodeBlock)) { + escapeSpecialCharacters(fragmentText); + maybeEscapeFirstChar(fragmentText); + } if (block.textList()) { //
  • first line
    continuation
  • QString newlineIndent = QString(qtmw_Newline) + QString(m_wrappedLineIndent, qtmw_Space); diff --git a/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp b/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp index c78a1b29cba..53fa8274462 100644 --- a/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp +++ b/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp @@ -253,7 +253,7 @@ void tst_QTextMarkdownImporter::lists_data() QTest::newRow("hyphen space newline") << "- \n" << 0 << 1 << 1 << true << "- \n"; QTest::newRow("hyphen space letter newline") << "- a\n" << 0 << 1 << 1 << false << "- a\n"; QTest::newRow("hyphen nbsp newline") << - QString::fromUtf8("-\u00A0\n") << 0 << 1 << 0 << true << "-\u00A0\n\n"; + QString::fromUtf8("-\u00A0\n") << 0 << 1 << 0 << true << "\\-\u00A0\n\n"; QTest::newRow("nested empty lists") << "*\n *\n *\n" << 0 << 1 << 1 << true << " * \n"; QTest::newRow("list nested in empty list") << "-\n * a\n" << 0 << 1 << 2 << false << "- \n * a\n"; QTest::newRow("lists nested in empty lists") diff --git a/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp b/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp index 3c51ccdff84..d0ae34d67de 100644 --- a/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp +++ b/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp @@ -45,6 +45,8 @@ private slots: void rewriteDocument(); void fromHtml_data(); void fromHtml(); + void escapeSpecialCharacters_data(); + void escapeSpecialCharacters(); private: bool isMainFontFixed(); @@ -822,16 +824,21 @@ void tst_QTextMarkdownWriter::fromHtml_data() QTest::newRow("code") << "
    \n#include \"foo.h\"\n\nblock {\n    statement();\n}\n\n
    " << "```pseudocode\n#include \"foo.h\"\n\nblock {\n statement();\n}\n\n```\n\n"; - // TODO -// QTest::newRow("escaped number and paren after double newline") << -// "

    (The first sentence of this paragraph is a line, the next paragraph has a number

    13) but that's not part of an ordered list" << -// "(The first sentence of this paragraph is a line, the next paragraph has a number\n\n13\\) but that's not part of an ordered list\n\n"; + QTest::newRow("escaped number and paren after single newline") << + "

    (The first sentence of this paragraph is a line, next paragraph has a number 13) but that's not part of an ordered list

    " << + "(The first sentence of this paragraph is a line, next paragraph has a number\n13\\) but that's not part of an ordered list\n\n"; + QTest::newRow("escaped number and paren after double newline") << + "

    (The first sentence of this paragraph is a line, the next paragraph has a number

    13) but that's not part of an ordered list" << + "(The first sentence of this paragraph is a line, the next paragraph has a number\n\n13\\) but that's not part of an ordered list\n\n"; QTest::newRow("preformats with embedded backticks") << "
    none `one` ``two``
    plain
    ```three``` ````four````
    plain" << "```\nnone `one` ``two``\n\n```\nplain\n\n```\n```three``` ````four````\n\n```\nplain\n\n"; QTest::newRow("list items with and without checkboxes") << "
    • bullet
    • unchecked item
    • checked item
    " << "- bullet\n- [ ] unchecked item\n- [x] checked item\n"; + QTest::newRow("table with backslash in cell") << // QTBUG-96051 + "
    1011011 [1011100 backslash \\
    " << + "|1011011 [|1011100 backslash \\\\|"; } void tst_QTextMarkdownWriter::fromHtml() @@ -858,5 +865,56 @@ void tst_QTextMarkdownWriter::fromHtml() QCOMPARE(output, expectedOutput); } +void tst_QTextMarkdownWriter::escapeSpecialCharacters_data() +{ + QTest::addColumn("input"); + QTest::addColumn("expectedOutput"); + + QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\ baz \\\\"; + QTest::newRow("not emphasized") << "*normal* **normal too**" << "\\*normal* \\**normal too**"; + QTest::newRow("not code") << "`normal` `normal too`" << "\\`normal` \\`normal too`"; + QTest::newRow("code fence") << "```not a fence; ``` no risk here; ```not a fence" // TODO slightly inconsistent + << "\\```not a fence; ``` no risk here; \\```not a fence"; + QTest::newRow("not html") << "

    not a tag:
    nope

    " << "\\

    not a tag: \\
    nope\\

    "; + QTest::newRow("not a link") << "text [not a link](/foo)" << "text \\[not a link](/foo)"; + QTest::newRow("not a circle") << "* polaris" << "\\* polaris"; + QTest::newRow("not a square") << "+ groovy" << "\\+ groovy"; + QTest::newRow("not a bullet") << "- stayin alive" << "\\- stayin alive"; + QTest::newRow("arithmetic") << "1 + 2 - 3 * 4" << "1 + 2 - 3 * 4"; + QTest::newRow("not a list") << "1. not a list" << "1\\. not a list"; + QTest::newRow("not a list either") << "Jupiter and 10." << "Jupiter and 10."; + QTest::newRow("not a heading") << "# not a heading" << "\\# not a heading"; + QTest::newRow("a non-entity") << "ö not a character entity" << "\\ö not a character entity"; +} + +/*! \internal + If the user types into a Qt-based editor plain text that the + markdown parser would misinterpret, escape it when we save to markdown + to clarify that it's plain text. + https://spec.commonmark.org/0.31.2/#backslash-escapes +*/ +void tst_QTextMarkdownWriter::escapeSpecialCharacters() // QTBUG-96051, QTBUG-122083 +{ + QFETCH(QString, input); + QFETCH(QString, expectedOutput); + + document->setPlainText(input); + QString output = documentToUnixMarkdown(); + +#ifdef DEBUG_WRITE_OUTPUT + { + QFile out("/tmp/" + QLatin1String(QTest::currentDataTag()) + ".md"); + out.open(QFile::WriteOnly); + out.write(output.toUtf8()); + out.close(); + } +#endif + + output = output.trimmed(); + if (output != expectedOutput && (isMainFontFixed() || isFixedFontProportional())) + QEXPECT_FAIL("", "fixed main font or proportional fixed font (QTBUG-103484)", Continue); + QCOMPARE(output, expectedOutput); +} + QTEST_MAIN(tst_QTextMarkdownWriter) #include "tst_qtextmarkdownwriter.moc"