Skip to content
This repository has been archived by the owner on Feb 9, 2024. It is now read-only.

Commit

Permalink
Escape control chars even if emitting UTF8 (open-source-parsers#1178)
Browse files Browse the repository at this point in the history
* Escape control chars even if emitting UTF8

See open-source-parsers#1176
Fixes open-source-parsers#1175

* review comments

* fix test by stopping early enough to punt on utf8-input.
  • Loading branch information
BillyDonahue authored May 21, 2020
1 parent 75b360a commit c161f4a
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 22 deletions.
49 changes: 27 additions & 22 deletions src/lib_json/json_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,14 @@ static String toHex16Bit(unsigned int x) {
return result;
}

static void appendRaw(String& result, unsigned ch) {
result += static_cast<char>(ch);
}

static void appendHex(String& result, unsigned ch) {
result.append("\\u").append(toHex16Bit(ch));
}

static String valueToQuotedStringN(const char* value, unsigned length,
bool emitUTF8 = false) {
if (value == nullptr)
Expand Down Expand Up @@ -310,29 +318,26 @@ static String valueToQuotedStringN(const char* value, unsigned length,
// sequence from occurring.
default: {
if (emitUTF8) {
result += *c;
unsigned codepoint = static_cast<unsigned char>(*c);
if (codepoint < 0x20) {
appendHex(result, codepoint);
} else {
appendRaw(result, codepoint);
}
} else {
unsigned int codepoint = utf8ToCodepoint(c, end);
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
// don't escape non-control characters
// (short escape sequence are applied above)
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
result += static_cast<char>(codepoint);
} else if (codepoint <
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
// Multilingual Plane
result += "\\u";
result += toHex16Bit(codepoint);
} else { // codepoint is not in Basic Multilingual Plane
// convert to surrogate pair first
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
result += "\\u";
result += toHex16Bit((codepoint >> 10) + 0xD800);
result += "\\u";
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
unsigned codepoint = utf8ToCodepoint(c, end); // modifies `c`
if (codepoint < 0x20) {
appendHex(result, codepoint);
} else if (codepoint < 0x80) {
appendRaw(result, codepoint);
} else if (codepoint < 0x10000) {
// Basic Multilingual Plane
appendHex(result, codepoint);
} else {
// Extended Unicode. Encode 20 bits as a surrogate pair.
codepoint -= 0x10000;
appendHex(result, 0xd800 + ((codepoint >> 10) & 0x3ff));
appendHex(result, 0xdc00 + (codepoint & 0x3ff));
}
}
} break;
Expand Down
62 changes: 62 additions & 0 deletions src/test_lib_json/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2640,6 +2640,68 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
}

// Control chars should be escaped regardless of UTF-8 input encoding.
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
auto uEscape = [](unsigned ch) {
static const char h[] = "0123456789abcdef";
std::string r = "\\u";
r += h[(ch >> (3 * 4)) & 0xf];
r += h[(ch >> (2 * 4)) & 0xf];
r += h[(ch >> (1 * 4)) & 0xf];
r += h[(ch >> (0 * 4)) & 0xf];
return r;
};
auto shortEscape = [](unsigned ch) -> const char* {
switch (ch) {
case '\"':
return "\\\"";
case '\\':
return "\\\\";
case '\b':
return "\\b";
case '\f':
return "\\f";
case '\n':
return "\\n";
case '\r':
return "\\r";
case '\t':
return "\\t";
default:
return nullptr;
}
};

Json::StreamWriterBuilder b;

for (bool emitUTF8 : {true, false}) {
b.settings_["emitUTF8"] = emitUTF8;

for (unsigned i = 0; i != 0x100; ++i) {
if (!emitUTF8 && i >= 0x80)
break; // The algorithm would try to parse UTF-8, so stop here.

std::string raw({static_cast<char>(i)});
std::string esc = raw;
if (i < 0x20)
esc = uEscape(i);
if (const char* shEsc = shortEscape(i))
esc = shEsc;

// std::cout << "emit=" << emitUTF8 << ", i=" << std::hex << i << std::dec
// << std::endl;

Json::Value root;
root["test"] = raw;
JSONTEST_ASSERT_STRING_EQUAL(
std::string("{\n\t\"test\" : \"").append(esc).append("\"\n}"),
Json::writeString(b, root))
<< ", emit=" << emitUTF8 << ", i=" << i << ", raw=\"" << raw << "\""
<< ", esc=\"" << esc << "\"";
}
}
}

struct ReaderTest : JsonTest::TestCase {
void setStrictMode() {
reader = std::unique_ptr<Json::Reader>(
Expand Down

0 comments on commit c161f4a

Please sign in to comment.