Skip to content

Commit

Permalink
Bug 1675626 - Refactor use of CharBuffer in TokenStream. r=tcampbell
Browse files Browse the repository at this point in the history
This makes the CharBuffer-manipulating functions standalone.

Also, a comment in `addLineOfContext` is wrong: it is not necessary to clear
`this->charBuffer` there. The method can just use a new local CharBuffer.

Differential Revision: https://phabricator.services.mozilla.com/D96116
  • Loading branch information
jswalden committed Nov 12, 2020
1 parent 3cf61cc commit 5bfeaaa
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 51 deletions.
60 changes: 28 additions & 32 deletions js/src/frontend/TokenStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -552,11 +552,9 @@ TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
: TokenStreamCharsShared(cx, pasrerAtoms),
sourceUnits(units, length, startOffset) {}

template <>
MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
fillCharBufferFromSourceNormalizingAsciiLineBreaks(const char16_t* cur,
const char16_t* end) {
MOZ_ASSERT(this->charBuffer.length() == 0);
bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
CharBuffer& charBuffer, const char16_t* cur, const char16_t* end) {
MOZ_ASSERT(charBuffer.length() == 0);

while (cur < end) {
char16_t ch = *cur++;
Expand All @@ -567,7 +565,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
}
}

if (!this->charBuffer.append(ch)) {
if (!charBuffer.append(ch)) {
return false;
}
}
Expand All @@ -576,11 +574,9 @@ MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
return true;
}

template <>
MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
fillCharBufferFromSourceNormalizingAsciiLineBreaks(const Utf8Unit* cur,
const Utf8Unit* end) {
MOZ_ASSERT(this->charBuffer.length() == 0);
bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
CharBuffer& charBuffer, const Utf8Unit* cur, const Utf8Unit* end) {
MOZ_ASSERT(charBuffer.length() == 0);

while (cur < end) {
Utf8Unit unit = *cur++;
Expand All @@ -593,7 +589,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
}
}

if (!this->charBuffer.append(ch)) {
if (!charBuffer.append(ch)) {
return false;
}

Expand All @@ -604,7 +600,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
MOZ_ASSERT(ch.isSome(),
"provided source text should already have been validated");

if (!appendCodePointToCharBuffer(ch.value())) {
if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
return false;
}
}
Expand Down Expand Up @@ -1736,26 +1732,22 @@ bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
return true;
}

// We might have hit an error while processing some source code feature
// that's accumulating text into |this->charBuffer| -- e.g. we could be
// halfway into a regular expression literal, then encounter invalid UTF-8.
// Thus we must clear |this->charBuffer| of prior work.
this->charBuffer.clear();
CharBuffer lineOfContext(cx);

const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks(
encodedWindow, encodedWindow + encodedWindowLength)) {
if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
return false;
}

size_t utf16WindowLength = this->charBuffer.length();
size_t utf16WindowLength = lineOfContext.length();

// The windowed string is null-terminated.
if (!this->charBuffer.append('\0')) {
if (!lineOfContext.append('\0')) {
return false;
}

err->lineOfContext.reset(this->charBuffer.extractOrCopyRawBuffer());
err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
if (!err->lineOfContext) {
return false;
}
Expand Down Expand Up @@ -2080,7 +2072,7 @@ MOZ_MUST_USE bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
"maintain line-info/flags for EOL");
this->sourceUnits.consumeKnownCodePoint(peeked);

if (!appendCodePointToCharBuffer(peeked.codePoint())) {
if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
return false;
}
} while (true);
Expand Down Expand Up @@ -2167,8 +2159,11 @@ MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
return false;
};

MOZ_MUST_USE bool TokenStreamCharsShared::appendCodePointToCharBuffer(
uint32_t codePoint) {
bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) {
MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
"should only be processing code points validly decoded from UTF-8 "
"or WTF-16 source text (surrogate code points permitted)");

char16_t units[2];
unsigned numUnits = 0;
unicode::UTF16Encode(codePoint, units, &numUnits);
Expand Down Expand Up @@ -2231,7 +2226,7 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
}
}

if (!appendCodePointToCharBuffer(codePoint)) {
if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
return false;
}
} while (true);
Expand Down Expand Up @@ -2617,7 +2612,7 @@ MOZ_MUST_USE bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
return false;
}

return this->appendCodePointToCharBuffer(codePoint);
return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
};

auto ReportUnterminatedRegExp = [this](int32_t unit) {
Expand Down Expand Up @@ -2741,7 +2736,7 @@ MOZ_MUST_USE bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
if (unit == '_') {
continue;
}
if (!this->appendCodePointToCharBuffer(unit)) {
if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
return false;
}
}
Expand Down Expand Up @@ -3438,7 +3433,7 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
MOZ_ASSERT(!IsLineTerminator(cp));
}

if (!appendCodePointToCharBuffer(cp)) {
if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
return false;
}

Expand Down Expand Up @@ -3468,7 +3463,8 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
// LineContinuation represents no code points, so don't append
// in this case.
if (codePoint != '\n') {
if (!appendCodePointToCharBuffer(AssertedCast<char32_t>(codePoint))) {
if (!AppendCodePointToCharBuffer(this->charBuffer,
AssertedCast<char32_t>(codePoint))) {
return false;
}
}
Expand Down Expand Up @@ -3595,7 +3591,7 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
}

MOZ_ASSERT(code <= unicode::NonBMPMax);
if (!appendCodePointToCharBuffer(code)) {
if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
return false;
}

Expand Down
58 changes: 39 additions & 19 deletions js/src/frontend/TokenStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -1502,12 +1502,44 @@ inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
MOZ_ASSERT(last == 0xA8 || last == 0xA9);
}

class TokenStreamCharsShared {
// Using char16_t (not Unit) is a simplifying decision that hopefully
// eliminates the need for a UTF-8 regular expression parser and makes
// |copyCharBufferTo| markedly simpler.
using CharBuffer = Vector<char16_t, 32>;
/**
* An all-purpose buffer type for accumulating text during tokenizing.
*
* In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or
* |Unit|. We use |char16_t| because:
*
* * we don't have a UTF-8 regular expression parser, so in general regular
* expression text must be copied to a separate UTF-16 buffer to parse it,
* and
* * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared
* |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert.
*/
using CharBuffer = Vector<char16_t, 32>;

/**
* Append the provided code point (in the range [U+0000, U+10FFFF], surrogate
* code points included) to the buffer.
*/
extern MOZ_MUST_USE bool AppendCodePointToCharBuffer(CharBuffer& charBuffer,
uint32_t codePoint);

/**
* Accumulate the range of UTF-16 text (lone surrogates permitted, because JS
* allows them in source text) into |charBuffer|. Normalize '\r', '\n', and
* "\r\n" into '\n'.
*/
extern MOZ_MUST_USE bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
CharBuffer& charBuffer, const char16_t* cur, const char16_t* end);

/**
* Accumulate the range of previously-validated UTF-8 text into |charBuffer|.
* Normalize '\r', '\n', and "\r\n" into '\n'.
*/
extern MOZ_MUST_USE bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
CharBuffer& charBuffer, const mozilla::Utf8Unit* cur,
const mozilla::Utf8Unit* end);

class TokenStreamCharsShared {
protected:
JSContext* cx;

Expand All @@ -1525,8 +1557,6 @@ class TokenStreamCharsShared {
explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms)
: cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {}

MOZ_MUST_USE bool appendCodePointToCharBuffer(uint32_t codePoint);

MOZ_MUST_USE bool copyCharBufferTo(
JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);

Expand Down Expand Up @@ -1654,14 +1684,6 @@ class TokenStreamCharsBase : public TokenStreamCharsShared {
template <typename T>
inline void consumeKnownCodeUnit(T) = delete;

/**
* Accumulate the provided range of already-validated text (valid UTF-8, or
* anything if Unit is char16_t because JS allows lone surrogates) into
* |charBuffer|. Normalize '\r', '\n', and "\r\n" into '\n'.
*/
MOZ_MUST_USE bool fillCharBufferFromSourceNormalizingAsciiLineBreaks(
const Unit* cur, const Unit* end);

/**
* Add a null-terminated line of context to error information, for the line
* in |sourceUnits| that contains |offset|. Also record the window's
Expand Down Expand Up @@ -1916,7 +1938,6 @@ class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {

protected:
using CharsBase::addLineOfContext;
using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks;
using CharsBase::matchCodeUnit;
using CharsBase::matchLineTerminator;
using TokenStreamCharsShared::drainCharBufferIntoAtom;
Expand Down Expand Up @@ -2144,7 +2165,8 @@ class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
// Template literals normalize only '\r' and "\r\n" to '\n'; Unicode
// separators don't need special handling.
// https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks(cur, end)) {
if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer,
cur, end)) {
return nullptr;
}

Expand Down Expand Up @@ -2431,10 +2453,8 @@ class MOZ_STACK_CLASS TokenStreamSpecific
private:
using CharsBase::atomizeSourceChars;
using GeneralCharsBase::badToken;
using TokenStreamCharsShared::appendCodePointToCharBuffer;
// Deliberately don't |using| |charBuffer| because of bug 1472569. :-(
using CharsBase::consumeKnownCodeUnit;
using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks;
using CharsBase::matchCodeUnit;
using CharsBase::matchLineTerminator;
using CharsBase::peekCodeUnit;
Expand Down

0 comments on commit 5bfeaaa

Please sign in to comment.