Skip to content

Commit

Permalink
QTextMarkdownWriter: escape special characters (line or word prefix)
Browse files Browse the repository at this point in the history
Try to avoid writing anything that the parser would misinterpret.
Escape pre-existing backslashes, but not those that are already escaped.
Optimize maybeEscapeFirstChar() slightly and apply it to every line
of output (except in code blocks), not only to new lines created by
word-wrapping.

Since it would be hard to do this without using regular expressions,
the markdown writer feature now depends on the regex feature.

Fixes: QTBUG-96051
Fixes: QTBUG-122083
Pick-to: 6.7
Change-Id: I8d95366501fd31441829081c668f11a3a3a23fe2
Reviewed-by: Axel Spoerl <[email protected]>
Reviewed-by: Qt CI Bot <[email protected]>
  • Loading branch information
ec1oud committed Mar 5, 2024
1 parent 5670d5f commit ca47741
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 8 deletions.
1 change: 1 addition & 0 deletions src/gui/configure.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,7 @@ qt_feature("system-textmarkdownreader" PUBLIC
qt_feature("textmarkdownwriter" PUBLIC
SECTION "Kernel"
LABEL "MarkdownWriter"
CONDITION QT_FEATURE_regularexpression
PURPOSE "Provides a Markdown (CommonMark and GitHub) writer"
)
qt_feature("textodfwriter" PUBLIC
Expand Down
54 changes: 51 additions & 3 deletions src/gui/text/qtextmarkdownwriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "qtextimagehandler_p.h"
#include "qtextmarkdownimporter_p.h"
#include "qloggingcategory.h"
#include <QtCore/QRegularExpression>
#if QT_CONFIG(itemmodel)
#include "qabstractitemmodel.h"
#endif
Expand Down Expand Up @@ -286,15 +287,58 @@ static int adjacentBackticksCount(const QString &s)
return ret;
}

/*! \internal
Escape anything at the beginning of a line of markdown that would be
misinterpreted by a markdown parser, including any period that follows a
number (to avoid misinterpretation as a numbered list item).
https://spec.commonmark.org/0.31.2/#backslash-escapes
*/
static void maybeEscapeFirstChar(QString &s)
{
static const QRegularExpression numericListRe(uR"(\d+([\.)])\s)"_s);
static const QLatin1StringView specialFirstCharacters("#*+-");

QString sTrimmed = s.trimmed();
if (sTrimmed.isEmpty())
return;
char firstChar = sTrimmed.at(0).toLatin1();
if (firstChar == '*' || firstChar == '+' || firstChar == '-') {
int i = s.indexOf(QLatin1Char(firstChar));
QChar firstChar = sTrimmed.at(0);
if (specialFirstCharacters.contains(firstChar)) {
int i = s.indexOf(firstChar); // == 0 unless s got trimmed
s.insert(i, u'\\');
} else {
auto match = numericListRe.match(s, 0, QRegularExpression::NormalMatch,
QRegularExpression::AnchorAtOffsetMatchOption);
if (match.hasMatch())
s.insert(match.capturedStart(1), qtmw_Backslash);
}
}

/*! \internal
Escape unescaped backslashes. Then escape any special character that stands
alone or prefixes a "word", including the \c < that starts an HTML tag.
https://spec.commonmark.org/0.31.2/#backslash-escapes
*/
static void escapeSpecialCharacters(QString &s)
{
static const QRegularExpression backslashRe(uR"([^\\]\\)"_s);
static const QRegularExpression spaceRe(uR"(\s+)"_s);
static const QRegularExpression specialRe(uR"([<!*[`&]+[/\w])"_s);

int i = 0;
while (i >= 0) {
if (int j = s.indexOf(backslashRe, i); j >= 0) {
++j; // we found some char before the backslash that needs escaping
if (s.size() == j + 1 || s.at(j + 1) != qtmw_Backslash)
s.insert(j, qtmw_Backslash);
i = j + 3;
}
if (int j = s.indexOf(specialRe, i); j >= 0 && (j == 0 || s.at(j - 1) != u'\\')) {
s.insert(j, qtmw_Backslash);
i = j + 3;
}
i = s.indexOf(spaceRe, i);
if (i >= 0)
++i; // past the whitespace, if found
}
}

Expand Down Expand Up @@ -504,6 +548,10 @@ int QTextMarkdownWriter::writeBlock(const QTextBlock &block, bool wrap, bool ign
QString fragmentText = frag.fragment().text();
while (fragmentText.endsWith(qtmw_Newline))
fragmentText.chop(1);
if (!(m_fencedCodeBlock || m_indentedCodeBlock)) {
escapeSpecialCharacters(fragmentText);
maybeEscapeFirstChar(fragmentText);
}
if (block.textList()) { // <li>first line</br>continuation</li>
QString newlineIndent =
QString(qtmw_Newline) + QString(m_wrappedLineIndent, qtmw_Space);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ void tst_QTextMarkdownImporter::lists_data()
QTest::newRow("hyphen space newline") << "- \n" << 0 << 1 << 1 << true << "- \n";
QTest::newRow("hyphen space letter newline") << "- a\n" << 0 << 1 << 1 << false << "- a\n";
QTest::newRow("hyphen nbsp newline") <<
QString::fromUtf8("-\u00A0\n") << 0 << 1 << 0 << true << "-\u00A0\n\n";
QString::fromUtf8("-\u00A0\n") << 0 << 1 << 0 << true << "\\-\u00A0\n\n";
QTest::newRow("nested empty lists") << "*\n *\n *\n" << 0 << 1 << 1 << true << " * \n";
QTest::newRow("list nested in empty list") << "-\n * a\n" << 0 << 1 << 2 << false << "- \n * a\n";
QTest::newRow("lists nested in empty lists")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ private slots:
void rewriteDocument();
void fromHtml_data();
void fromHtml();
void escapeSpecialCharacters_data();
void escapeSpecialCharacters();

private:
bool isMainFontFixed();
Expand Down Expand Up @@ -822,16 +824,21 @@ void tst_QTextMarkdownWriter::fromHtml_data()
QTest::newRow("code") <<
"<pre class=\"language-pseudocode\">\n#include \"foo.h\"\n\nblock {\n statement();\n}\n\n</pre>" <<
"```pseudocode\n#include \"foo.h\"\n\nblock {\n statement();\n}\n\n```\n\n";
// TODO
// QTest::newRow("escaped number and paren after double newline") <<
// "<p>(The first sentence of this paragraph is a line, the next paragraph has a number</p>13) but that's not part of an ordered list" <<
// "(The first sentence of this paragraph is a line, the next paragraph has a number\n\n13\\) but that's not part of an ordered list\n\n";
QTest::newRow("escaped number and paren after single newline") <<
"<p>(The first sentence of this paragraph is a line, next paragraph has a number 13) but that's not part of an ordered list</p>" <<
"(The first sentence of this paragraph is a line, next paragraph has a number\n13\\) but that's not part of an ordered list\n\n";
QTest::newRow("escaped number and paren after double newline") <<
"<p>(The first sentence of this paragraph is a line, the next paragraph has a number</p>13) but that's not part of an ordered list" <<
"(The first sentence of this paragraph is a line, the next paragraph has a number\n\n13\\) but that's not part of an ordered list\n\n";
QTest::newRow("preformats with embedded backticks") <<
"<pre>none `one` ``two``</pre>plain<pre>```three``` ````four````</pre>plain" <<
"```\nnone `one` ``two``\n\n```\nplain\n\n```\n```three``` ````four````\n\n```\nplain\n\n";
QTest::newRow("list items with and without checkboxes") <<
"<ul><li>bullet</li><li class=\"unchecked\">unchecked item</li><li class=\"checked\">checked item</li></ul>" <<
"- bullet\n- [ ] unchecked item\n- [x] checked item\n";
QTest::newRow("table with backslash in cell") << // QTBUG-96051
"<table><tr><td>1011011 [</td><td>1011100 backslash \\</td></tr></table>" <<
"|1011011 [|1011100 backslash \\\\|";
}

void tst_QTextMarkdownWriter::fromHtml()
Expand All @@ -858,5 +865,56 @@ void tst_QTextMarkdownWriter::fromHtml()
QCOMPARE(output, expectedOutput);
}

void tst_QTextMarkdownWriter::escapeSpecialCharacters_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expectedOutput");

QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\ baz \\\\";
QTest::newRow("not emphasized") << "*normal* **normal too**" << "\\*normal* \\**normal too**";
QTest::newRow("not code") << "`normal` `normal too`" << "\\`normal` \\`normal too`";
QTest::newRow("code fence") << "```not a fence; ``` no risk here; ```not a fence" // TODO slightly inconsistent
<< "\\```not a fence; ``` no risk here; \\```not a fence";
QTest::newRow("not html") << "<p>not a tag: <br/> nope</p>" << "\\<p>not a tag: \\<br/> nope\\</p>";
QTest::newRow("not a link") << "text [not a link](/foo)" << "text \\[not a link](/foo)";
QTest::newRow("not a circle") << "* polaris" << "\\* polaris";
QTest::newRow("not a square") << "+ groovy" << "\\+ groovy";
QTest::newRow("not a bullet") << "- stayin alive" << "\\- stayin alive";
QTest::newRow("arithmetic") << "1 + 2 - 3 * 4" << "1 + 2 - 3 * 4";
QTest::newRow("not a list") << "1. not a list" << "1\\. not a list";
QTest::newRow("not a list either") << "Jupiter and 10." << "Jupiter and 10.";
QTest::newRow("not a heading") << "# not a heading" << "\\# not a heading";
QTest::newRow("a non-entity") << "&ouml; not a character entity" << "\\&ouml; not a character entity";
}

/*! \internal
If the user types into a Qt-based editor plain text that the
markdown parser would misinterpret, escape it when we save to markdown
to clarify that it's plain text.
https://spec.commonmark.org/0.31.2/#backslash-escapes
*/
void tst_QTextMarkdownWriter::escapeSpecialCharacters() // QTBUG-96051, QTBUG-122083
{
QFETCH(QString, input);
QFETCH(QString, expectedOutput);

document->setPlainText(input);
QString output = documentToUnixMarkdown();

#ifdef DEBUG_WRITE_OUTPUT
{
QFile out("/tmp/" + QLatin1String(QTest::currentDataTag()) + ".md");
out.open(QFile::WriteOnly);
out.write(output.toUtf8());
out.close();
}
#endif

output = output.trimmed();
if (output != expectedOutput && (isMainFontFixed() || isFixedFontProportional()))
QEXPECT_FAIL("", "fixed main font or proportional fixed font (QTBUG-103484)", Continue);
QCOMPARE(output, expectedOutput);
}

QTEST_MAIN(tst_QTextMarkdownWriter)
#include "tst_qtextmarkdownwriter.moc"

0 comments on commit ca47741

Please sign in to comment.