Bug 1675626 - Refactor use of CharBuffer in TokenStream. r=tcampbell

This makes the CharBuffer-manipulating functions standalone. Also, a comment in `addLineOfContext` is wrong: it is not necessary to clear `this->charBuffer` there. The method can just use a new local CharBuffer. Differential Revision: https://phabricator.services.mozilla.com/D96116
Thaodan · Nov 12, 2020 · 5bfeaaa · 5bfeaaa
1 parent 3cf61cc
commit 5bfeaaa
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 51 deletions.
diff --git a/js/src/frontend/TokenStream.cpp b/js/src/frontend/TokenStream.cpp
@@ -552,11 +552,9 @@ TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
     : TokenStreamCharsShared(cx, pasrerAtoms),
       sourceUnits(units, length, startOffset) {}
 
-template <>
-MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
-    fillCharBufferFromSourceNormalizingAsciiLineBreaks(const char16_t* cur,
-                                                       const char16_t* end) {
-  MOZ_ASSERT(this->charBuffer.length() == 0);
+bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
+    CharBuffer& charBuffer, const char16_t* cur, const char16_t* end) {
+  MOZ_ASSERT(charBuffer.length() == 0);
 
   while (cur < end) {
     char16_t ch = *cur++;
@@ -567,7 +565,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
       }
     }
 
-    if (!this->charBuffer.append(ch)) {
+    if (!charBuffer.append(ch)) {
       return false;
     }
   }
@@ -576,11 +574,9 @@ MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
   return true;
 }
 
-template <>
-MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
-    fillCharBufferFromSourceNormalizingAsciiLineBreaks(const Utf8Unit* cur,
-                                                       const Utf8Unit* end) {
-  MOZ_ASSERT(this->charBuffer.length() == 0);
+bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
+    CharBuffer& charBuffer, const Utf8Unit* cur, const Utf8Unit* end) {
+  MOZ_ASSERT(charBuffer.length() == 0);
 
   while (cur < end) {
     Utf8Unit unit = *cur++;
@@ -593,7 +589,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
         }
       }
 
-      if (!this->charBuffer.append(ch)) {
+      if (!charBuffer.append(ch)) {
         return false;
       }
 
@@ -604,7 +600,7 @@ MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
     MOZ_ASSERT(ch.isSome(),
                "provided source text should already have been validated");
 
-    if (!appendCodePointToCharBuffer(ch.value())) {
+    if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
       return false;
     }
   }
@@ -1736,26 +1732,22 @@ bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
     return true;
   }
 
-  // We might have hit an error while processing some source code feature
-  // that's accumulating text into |this->charBuffer| -- e.g. we could be
-  // halfway into a regular expression literal, then encounter invalid UTF-8.
-  // Thus we must clear |this->charBuffer| of prior work.
-  this->charBuffer.clear();
+  CharBuffer lineOfContext(cx);
 
   const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
-  if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks(
-          encodedWindow, encodedWindow + encodedWindowLength)) {
+  if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
+          lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
     return false;
   }
 
-  size_t utf16WindowLength = this->charBuffer.length();
+  size_t utf16WindowLength = lineOfContext.length();
 
   // The windowed string is null-terminated.
-  if (!this->charBuffer.append('\0')) {
+  if (!lineOfContext.append('\0')) {
     return false;
   }
 
-  err->lineOfContext.reset(this->charBuffer.extractOrCopyRawBuffer());
+  err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
   if (!err->lineOfContext) {
     return false;
   }
@@ -2080,7 +2072,7 @@ MOZ_MUST_USE bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
                "maintain line-info/flags for EOL");
     this->sourceUnits.consumeKnownCodePoint(peeked);
 
-    if (!appendCodePointToCharBuffer(peeked.codePoint())) {
+    if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
       return false;
     }
   } while (true);
@@ -2167,8 +2159,11 @@ MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
   return false;
 };
 
-MOZ_MUST_USE bool TokenStreamCharsShared::appendCodePointToCharBuffer(
-    uint32_t codePoint) {
+bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) {
+  MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
+             "should only be processing code points validly decoded from UTF-8 "
+             "or WTF-16 source text (surrogate code points permitted)");
+
   char16_t units[2];
   unsigned numUnits = 0;
   unicode::UTF16Encode(codePoint, units, &numUnits);
@@ -2231,7 +2226,7 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
       }
     }
 
-    if (!appendCodePointToCharBuffer(codePoint)) {
+    if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
       return false;
     }
   } while (true);
@@ -2617,7 +2612,7 @@ MOZ_MUST_USE bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
       return false;
     }
 
-    return this->appendCodePointToCharBuffer(codePoint);
+    return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
   };
 
   auto ReportUnterminatedRegExp = [this](int32_t unit) {
@@ -2741,7 +2736,7 @@ MOZ_MUST_USE bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
     if (unit == '_') {
       continue;
     }
-    if (!this->appendCodePointToCharBuffer(unit)) {
+    if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
       return false;
     }
   }
@@ -3438,7 +3433,7 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
         MOZ_ASSERT(!IsLineTerminator(cp));
       }
 
-      if (!appendCodePointToCharBuffer(cp)) {
+      if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
         return false;
       }
 
@@ -3468,7 +3463,8 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
         // LineContinuation represents no code points, so don't append
         // in this case.
         if (codePoint != '\n') {
-          if (!appendCodePointToCharBuffer(AssertedCast<char32_t>(codePoint))) {
+          if (!AppendCodePointToCharBuffer(this->charBuffer,
+                                           AssertedCast<char32_t>(codePoint))) {
             return false;
           }
         }
@@ -3595,7 +3591,7 @@ bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
             }
 
             MOZ_ASSERT(code <= unicode::NonBMPMax);
-            if (!appendCodePointToCharBuffer(code)) {
+            if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
               return false;
             }
 

diff --git a/js/src/frontend/TokenStream.h b/js/src/frontend/TokenStream.h
@@ -1502,12 +1502,44 @@ inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
   MOZ_ASSERT(last == 0xA8 || last == 0xA9);
 }
 
-class TokenStreamCharsShared {
-  // Using char16_t (not Unit) is a simplifying decision that hopefully
-  // eliminates the need for a UTF-8 regular expression parser and makes
-  // |copyCharBufferTo| markedly simpler.
-  using CharBuffer = Vector<char16_t, 32>;
+/**
+ * An all-purpose buffer type for accumulating text during tokenizing.
+ *
+ * In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or
+ * |Unit|.  We use |char16_t| because:
+ *
+ *   * we don't have a UTF-8 regular expression parser, so in general regular
+ *     expression text must be copied to a separate UTF-16 buffer to parse it,
+ *     and
+ *   * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared
+ *     |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert.
+ */
+using CharBuffer = Vector<char16_t, 32>;
 
+/**
+ * Append the provided code point (in the range [U+0000, U+10FFFF], surrogate
+ * code points included) to the buffer.
+ */
+extern MOZ_MUST_USE bool AppendCodePointToCharBuffer(CharBuffer& charBuffer,
+                                                     uint32_t codePoint);
+
+/**
+ * Accumulate the range of UTF-16 text (lone surrogates permitted, because JS
+ * allows them in source text) into |charBuffer|.  Normalize '\r', '\n', and
+ * "\r\n" into '\n'.
+ */
+extern MOZ_MUST_USE bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
+    CharBuffer& charBuffer, const char16_t* cur, const char16_t* end);
+
+/**
+ * Accumulate the range of previously-validated UTF-8 text into |charBuffer|.
+ * Normalize '\r', '\n', and "\r\n" into '\n'.
+ */
+extern MOZ_MUST_USE bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
+    CharBuffer& charBuffer, const mozilla::Utf8Unit* cur,
+    const mozilla::Utf8Unit* end);
+
+class TokenStreamCharsShared {
  protected:
   JSContext* cx;
 
@@ -1525,8 +1557,6 @@ class TokenStreamCharsShared {
   explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms)
       : cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {}
 
-  MOZ_MUST_USE bool appendCodePointToCharBuffer(uint32_t codePoint);
-
   MOZ_MUST_USE bool copyCharBufferTo(
       JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);
 
@@ -1654,14 +1684,6 @@ class TokenStreamCharsBase : public TokenStreamCharsShared {
   template <typename T>
   inline void consumeKnownCodeUnit(T) = delete;
 
-  /**
-   * Accumulate the provided range of already-validated text (valid UTF-8, or
-   * anything if Unit is char16_t because JS allows lone surrogates) into
-   * |charBuffer|.  Normalize '\r', '\n', and "\r\n" into '\n'.
-   */
-  MOZ_MUST_USE bool fillCharBufferFromSourceNormalizingAsciiLineBreaks(
-      const Unit* cur, const Unit* end);
-
   /**
    * Add a null-terminated line of context to error information, for the line
    * in |sourceUnits| that contains |offset|.  Also record the window's
@@ -1916,7 +1938,6 @@ class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
 
  protected:
   using CharsBase::addLineOfContext;
-  using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks;
   using CharsBase::matchCodeUnit;
   using CharsBase::matchLineTerminator;
   using TokenStreamCharsShared::drainCharBufferIntoAtom;
@@ -2144,7 +2165,8 @@ class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
     // Template literals normalize only '\r' and "\r\n" to '\n'; Unicode
     // separators don't need special handling.
     // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
-    if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks(cur, end)) {
+    if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer,
+                                                            cur, end)) {
       return nullptr;
     }
 
@@ -2431,10 +2453,8 @@ class MOZ_STACK_CLASS TokenStreamSpecific
  private:
   using CharsBase::atomizeSourceChars;
   using GeneralCharsBase::badToken;
-  using TokenStreamCharsShared::appendCodePointToCharBuffer;
   // Deliberately don't |using| |charBuffer| because of bug 1472569.  :-(
   using CharsBase::consumeKnownCodeUnit;
-  using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks;
   using CharsBase::matchCodeUnit;
   using CharsBase::matchLineTerminator;
   using CharsBase::peekCodeUnit;