diff --git a/js/src/frontend/TokenStream.cpp b/js/src/frontend/TokenStream.cpp index 9c6bc8d97d87..a3159e1c8605 100644 --- a/js/src/frontend/TokenStream.cpp +++ b/js/src/frontend/TokenStream.cpp @@ -552,11 +552,9 @@ TokenStreamCharsBase::TokenStreamCharsBase(JSContext* cx, : TokenStreamCharsShared(cx, pasrerAtoms), sourceUnits(units, length, startOffset) {} -template <> -MOZ_MUST_USE bool TokenStreamCharsBase:: - fillCharBufferFromSourceNormalizingAsciiLineBreaks(const char16_t* cur, - const char16_t* end) { - MOZ_ASSERT(this->charBuffer.length() == 0); +bool FillCharBufferFromSourceNormalizingAsciiLineBreaks( + CharBuffer& charBuffer, const char16_t* cur, const char16_t* end) { + MOZ_ASSERT(charBuffer.length() == 0); while (cur < end) { char16_t ch = *cur++; @@ -567,7 +565,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase:: } } - if (!this->charBuffer.append(ch)) { + if (!charBuffer.append(ch)) { return false; } } @@ -576,11 +574,9 @@ MOZ_MUST_USE bool TokenStreamCharsBase:: return true; } -template <> -MOZ_MUST_USE bool TokenStreamCharsBase:: - fillCharBufferFromSourceNormalizingAsciiLineBreaks(const Utf8Unit* cur, - const Utf8Unit* end) { - MOZ_ASSERT(this->charBuffer.length() == 0); +bool FillCharBufferFromSourceNormalizingAsciiLineBreaks( + CharBuffer& charBuffer, const Utf8Unit* cur, const Utf8Unit* end) { + MOZ_ASSERT(charBuffer.length() == 0); while (cur < end) { Utf8Unit unit = *cur++; @@ -593,7 +589,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase:: } } - if (!this->charBuffer.append(ch)) { + if (!charBuffer.append(ch)) { return false; } @@ -604,7 +600,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase:: MOZ_ASSERT(ch.isSome(), "provided source text should already have been validated"); - if (!appendCodePointToCharBuffer(ch.value())) { + if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) { return false; } } @@ -1736,26 +1732,22 @@ bool TokenStreamCharsBase::addLineOfContext(ErrorMetadata* err, return true; } - // We might have hit an error while processing some source code feature - // that's accumulating text into |this->charBuffer| -- e.g. we could be - // halfway into a regular expression literal, then encounter invalid UTF-8. - // Thus we must clear |this->charBuffer| of prior work. - this->charBuffer.clear(); + CharBuffer lineOfContext(cx); const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart); - if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks( - encodedWindow, encodedWindow + encodedWindowLength)) { + if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks( + lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) { return false; } - size_t utf16WindowLength = this->charBuffer.length(); + size_t utf16WindowLength = lineOfContext.length(); // The windowed string is null-terminated. - if (!this->charBuffer.append('\0')) { + if (!lineOfContext.append('\0')) { return false; } - err->lineOfContext.reset(this->charBuffer.extractOrCopyRawBuffer()); + err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer()); if (!err->lineOfContext) { return false; } @@ -2080,7 +2072,7 @@ MOZ_MUST_USE bool TokenStreamSpecific::getDirective( "maintain line-info/flags for EOL"); this->sourceUnits.consumeKnownCodePoint(peeked); - if (!appendCodePointToCharBuffer(peeked.codePoint())) { + if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) { return false; } } while (true); @@ -2167,8 +2159,11 @@ MOZ_COLD bool GeneralTokenStreamChars::badToken() { return false; }; -MOZ_MUST_USE bool TokenStreamCharsShared::appendCodePointToCharBuffer( - uint32_t codePoint) { +bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) { + MOZ_ASSERT(codePoint <= unicode::NonBMPMax, + "should only be processing code points validly decoded from UTF-8 " + "or WTF-16 source text (surrogate code points permitted)"); + char16_t units[2]; unsigned numUnits = 0; unicode::UTF16Encode(codePoint, units, &numUnits); @@ -2231,7 +2226,7 @@ bool TokenStreamSpecific::putIdentInCharBuffer( } } - if (!appendCodePointToCharBuffer(codePoint)) { + if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) { return false; } } while (true); @@ -2617,7 +2612,7 @@ MOZ_MUST_USE bool TokenStreamSpecific::regexpLiteral( return false; } - return this->appendCodePointToCharBuffer(codePoint); + return AppendCodePointToCharBuffer(this->charBuffer, codePoint); }; auto ReportUnterminatedRegExp = [this](int32_t unit) { @@ -2741,7 +2736,7 @@ MOZ_MUST_USE bool TokenStreamSpecific::bigIntLiteral( if (unit == '_') { continue; } - if (!this->appendCodePointToCharBuffer(unit)) { + if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) { return false; } } @@ -3438,7 +3433,7 @@ bool TokenStreamSpecific::getStringOrTemplateToken( MOZ_ASSERT(!IsLineTerminator(cp)); } - if (!appendCodePointToCharBuffer(cp)) { + if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) { return false; } @@ -3468,7 +3463,8 @@ bool TokenStreamSpecific::getStringOrTemplateToken( // LineContinuation represents no code points, so don't append // in this case. if (codePoint != '\n') { - if (!appendCodePointToCharBuffer(AssertedCast(codePoint))) { + if (!AppendCodePointToCharBuffer(this->charBuffer, + AssertedCast(codePoint))) { return false; } } @@ -3595,7 +3591,7 @@ bool TokenStreamSpecific::getStringOrTemplateToken( } MOZ_ASSERT(code <= unicode::NonBMPMax); - if (!appendCodePointToCharBuffer(code)) { + if (!AppendCodePointToCharBuffer(this->charBuffer, code)) { return false; } diff --git a/js/src/frontend/TokenStream.h b/js/src/frontend/TokenStream.h index 0c5107a410ed..691cd9eded57 100644 --- a/js/src/frontend/TokenStream.h +++ b/js/src/frontend/TokenStream.h @@ -1502,12 +1502,44 @@ inline void SourceUnits::ungetLineOrParagraphSeparator() { MOZ_ASSERT(last == 0xA8 || last == 0xA9); } -class TokenStreamCharsShared { - // Using char16_t (not Unit) is a simplifying decision that hopefully - // eliminates the need for a UTF-8 regular expression parser and makes - // |copyCharBufferTo| markedly simpler. - using CharBuffer = Vector; +/** + * An all-purpose buffer type for accumulating text during tokenizing. + * + * In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or + * |Unit|. We use |char16_t| because: + * + * * we don't have a UTF-8 regular expression parser, so in general regular + * expression text must be copied to a separate UTF-16 buffer to parse it, + * and + * * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared + * |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert. + */ +using CharBuffer = Vector; +/** + * Append the provided code point (in the range [U+0000, U+10FFFF], surrogate + * code points included) to the buffer. + */ +extern MOZ_MUST_USE bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, + uint32_t codePoint); + +/** + * Accumulate the range of UTF-16 text (lone surrogates permitted, because JS + * allows them in source text) into |charBuffer|. Normalize '\r', '\n', and + * "\r\n" into '\n'. + */ +extern MOZ_MUST_USE bool FillCharBufferFromSourceNormalizingAsciiLineBreaks( + CharBuffer& charBuffer, const char16_t* cur, const char16_t* end); + +/** + * Accumulate the range of previously-validated UTF-8 text into |charBuffer|. + * Normalize '\r', '\n', and "\r\n" into '\n'. + */ +extern MOZ_MUST_USE bool FillCharBufferFromSourceNormalizingAsciiLineBreaks( + CharBuffer& charBuffer, const mozilla::Utf8Unit* cur, + const mozilla::Utf8Unit* end); + +class TokenStreamCharsShared { protected: JSContext* cx; @@ -1525,8 +1557,6 @@ class TokenStreamCharsShared { explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms) : cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {} - MOZ_MUST_USE bool appendCodePointToCharBuffer(uint32_t codePoint); - MOZ_MUST_USE bool copyCharBufferTo( JSContext* cx, UniquePtr* destination); @@ -1654,14 +1684,6 @@ class TokenStreamCharsBase : public TokenStreamCharsShared { template inline void consumeKnownCodeUnit(T) = delete; - /** - * Accumulate the provided range of already-validated text (valid UTF-8, or - * anything if Unit is char16_t because JS allows lone surrogates) into - * |charBuffer|. Normalize '\r', '\n', and "\r\n" into '\n'. - */ - MOZ_MUST_USE bool fillCharBufferFromSourceNormalizingAsciiLineBreaks( - const Unit* cur, const Unit* end); - /** * Add a null-terminated line of context to error information, for the line * in |sourceUnits| that contains |offset|. Also record the window's @@ -1916,7 +1938,6 @@ class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase { protected: using CharsBase::addLineOfContext; - using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks; using CharsBase::matchCodeUnit; using CharsBase::matchLineTerminator; using TokenStreamCharsShared::drainCharBufferIntoAtom; @@ -2144,7 +2165,8 @@ class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase { // Template literals normalize only '\r' and "\r\n" to '\n'; Unicode // separators don't need special handling. // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv - if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks(cur, end)) { + if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer, + cur, end)) { return nullptr; } @@ -2431,10 +2453,8 @@ class MOZ_STACK_CLASS TokenStreamSpecific private: using CharsBase::atomizeSourceChars; using GeneralCharsBase::badToken; - using TokenStreamCharsShared::appendCodePointToCharBuffer; // Deliberately don't |using| |charBuffer| because of bug 1472569. :-( using CharsBase::consumeKnownCodeUnit; - using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks; using CharsBase::matchCodeUnit; using CharsBase::matchLineTerminator; using CharsBase::peekCodeUnit;